[llvm] [ARM] Fix failure to register-allocate CMP_SWAP_64 pseudo-inst (PR #104039)
Oliver Stannard via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 15 01:35:25 PDT 2024
https://github.com/ostannard updated https://github.com/llvm/llvm-project/pull/104039
From ccfd2ab0be992ba25bfdc1b4892428cd4c98c2d4 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Wed, 14 Aug 2024 15:42:25 +0100
Subject: [PATCH 1/4] Re-generate tests
---
llvm/test/CodeGen/ARM/atomic-64bit.ll | 1846 ++++++++++++---
llvm/test/CodeGen/ARM/atomic-ops-v8.ll | 2904 ++++++++++++++++--------
2 files changed, 3489 insertions(+), 1261 deletions(-)
diff --git a/llvm/test/CodeGen/ARM/atomic-64bit.ll b/llvm/test/CodeGen/ARM/atomic-64bit.ll
index ab9e1dfd1cfb19..80658bdb9fda0a 100644
--- a/llvm/test/CodeGen/ARM/atomic-64bit.ll
+++ b/llvm/test/CodeGen/ARM/atomic-64bit.ll
@@ -1,219 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-LE
; RUN: llc < %s -mtriple=armebv7 -target-abi apcs | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
; RUN: llc < %s -mtriple=thumbebv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-BE
-; RUN: llc < %s -mtriple=armv7m--none-eabi | FileCheck %s --check-prefix=CHECK-M
-; RUN: llc < %s -mtriple=armv8m--none-eabi | FileCheck %s --check-prefix=CHECK-M
+; RUN: llc < %s -mtriple=armv7m--none-eabi | FileCheck %s --check-prefix=CHECK-7M
+; RUN: llc < %s -mtriple=armv8m--none-eabi | FileCheck %s --check-prefix=CHECK-8M
define i64 @test1(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test1:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-LE: adds [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK-LE: adc [[REG4:(r[0-9]?[13579])]], [[REG2]]
-; CHECK-BE: adds [[REG4:(r[0-9]?[13579])]], [[REG2]]
-; CHECK-BE: adc [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
-; CHECK-THUMB-LABEL: test1:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-LE: adds.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB-LE: adc.w [[REG4:[a-z0-9]+]], [[REG2]]
-; CHECK-THUMB-BE: adds.w [[REG4:[a-z0-9]+]], [[REG2]]
-; CHECK-THUMB-BE: adc.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
-
-; CHECK-M: __atomic_fetch_add_8
+; CHECK-LE-LABEL: test1:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB0_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-LE-NEXT: adds r6, r4, r1
+; CHECK-LE-NEXT: adc r7, r5, r2
+; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: bne LBB0_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: mov r0, r4
+; CHECK-LE-NEXT: mov r1, r5
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-LE-LABEL: test1:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: mov r12, r0
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: .LBB0_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-LE-NEXT: adds.w lr, r0, r2
+; CHECK-THUMB-LE-NEXT: adc.w r4, r1, r3
+; CHECK-THUMB-LE-NEXT: strexd r5, lr, r4, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB0_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: test1:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB0_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-BE-NEXT: adds r7, r5, r2
+; CHECK-BE-NEXT: adc r6, r4, r1
+; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: bne .LBB0_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test1:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: mov r12, r0
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: .LBB0_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-BE-NEXT: adds.w lr, r1, r3
+; CHECK-THUMB-BE-NEXT: adc.w r4, r0, r2
+; CHECK-THUMB-BE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB0_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-7M-LABEL: test1:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r7, lr}
+; CHECK-7M-NEXT: push {r7, lr}
+; CHECK-7M-NEXT: .pad #8
+; CHECK-7M-NEXT: sub sp, #8
+; CHECK-7M-NEXT: movs r1, #5
+; CHECK-7M-NEXT: str r1, [sp]
+; CHECK-7M-NEXT: bl __atomic_fetch_add_8
+; CHECK-7M-NEXT: add sp, #8
+; CHECK-7M-NEXT: pop {r7, pc}
+;
+; CHECK-8M-LABEL: test1:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r11, lr}
+; CHECK-8M-NEXT: push {r11, lr}
+; CHECK-8M-NEXT: .pad #8
+; CHECK-8M-NEXT: sub sp, sp, #8
+; CHECK-8M-NEXT: mov r1, #5
+; CHECK-8M-NEXT: str r1, [sp]
+; CHECK-8M-NEXT: bl __atomic_fetch_add_8
+; CHECK-8M-NEXT: add sp, sp, #8
+; CHECK-8M-NEXT: pop {r11, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
+
%r = atomicrmw add ptr %ptr, i64 %val seq_cst
ret i64 %r
}
define i64 @test2(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test2:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-LE: subs [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK-LE: sbc [[REG4:(r[0-9]?[13579])]], [[REG2]]
-; CHECK-BE: subs [[REG4:(r[0-9]?[13579])]], [[REG2]]
-; CHECK-BE: sbc [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
-; CHECK-THUMB-LABEL: test2:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-LE: subs.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB-LE: sbc.w [[REG4:[a-z0-9]+]], [[REG2]]
-; CHECK-THUMB-BE: subs.w [[REG4:[a-z0-9]+]], [[REG2]]
-; CHECK-THUMB-BE: sbc.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
-
-; CHECK-M: __atomic_fetch_sub_8
+; CHECK-LE-LABEL: test2:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB1_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-LE-NEXT: subs r6, r4, r1
+; CHECK-LE-NEXT: sbc r7, r5, r2
+; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: bne LBB1_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: mov r0, r4
+; CHECK-LE-NEXT: mov r1, r5
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-LE-LABEL: test2:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: mov r12, r0
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: .LBB1_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-LE-NEXT: subs.w lr, r0, r2
+; CHECK-THUMB-LE-NEXT: sbc.w r4, r1, r3
+; CHECK-THUMB-LE-NEXT: strexd r5, lr, r4, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB1_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: test2:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB1_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-BE-NEXT: subs r7, r5, r2
+; CHECK-BE-NEXT: sbc r6, r4, r1
+; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: bne .LBB1_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test2:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: mov r12, r0
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: .LBB1_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-BE-NEXT: subs.w lr, r1, r3
+; CHECK-THUMB-BE-NEXT: sbc.w r4, r0, r2
+; CHECK-THUMB-BE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB1_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-7M-LABEL: test2:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r7, lr}
+; CHECK-7M-NEXT: push {r7, lr}
+; CHECK-7M-NEXT: .pad #8
+; CHECK-7M-NEXT: sub sp, #8
+; CHECK-7M-NEXT: movs r1, #5
+; CHECK-7M-NEXT: str r1, [sp]
+; CHECK-7M-NEXT: bl __atomic_fetch_sub_8
+; CHECK-7M-NEXT: add sp, #8
+; CHECK-7M-NEXT: pop {r7, pc}
+;
+; CHECK-8M-LABEL: test2:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r11, lr}
+; CHECK-8M-NEXT: push {r11, lr}
+; CHECK-8M-NEXT: .pad #8
+; CHECK-8M-NEXT: sub sp, sp, #8
+; CHECK-8M-NEXT: mov r1, #5
+; CHECK-8M-NEXT: str r1, [sp]
+; CHECK-8M-NEXT: bl __atomic_fetch_sub_8
+; CHECK-8M-NEXT: add sp, sp, #8
+; CHECK-8M-NEXT: pop {r11, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
+
%r = atomicrmw sub ptr %ptr, i64 %val seq_cst
ret i64 %r
}
define i64 @test3(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test3:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-LE-DAG: and [[REG3:(r[0-9]?[02468])]], [[REG1]],
-; CHECK-LE-DAG: and [[REG4:(r[0-9]?[13579])]], [[REG2]],
-; CHECK-BE-DAG: and [[REG4:(r[0-9]?[13579])]], [[REG2]],
-; CHECK-BE-DAG: and [[REG3:(r[0-9]?[02468])]], [[REG1]],
-; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
+; CHECK-LE-LABEL: test3:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB2_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-LE-NEXT: and r6, r4, r1
+; CHECK-LE-NEXT: and r7, r5, r2
+; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: bne LBB2_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: mov r0, r4
+; CHECK-LE-NEXT: mov r1, r5
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+;
; CHECK-THUMB-LABEL: test3:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-LE-DAG: and.w [[REG3:[a-z0-9]+]], [[REG1]],
-; CHECK-THUMB-LE-DAG: and.w [[REG4:[a-z0-9]+]], [[REG2]],
-; CHECK-THUMB-BE-DAG: and.w [[REG4:[a-z0-9]+]], [[REG2]],
-; CHECK-THUMB-BE-DAG: and.w [[REG3:[a-z0-9]+]], [[REG1]],
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
-
-; CHECK-M: _atomic_fetch_and_8
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-NEXT: mov r12, r0
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: .LBB2_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-NEXT: and.w lr, r0, r2
+; CHECK-THUMB-NEXT: and.w r4, r1, r3
+; CHECK-THUMB-NEXT: strexd r5, lr, r4, [r12]
+; CHECK-THUMB-NEXT: cmp r5, #0
+; CHECK-THUMB-NEXT: bne .LBB2_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: test3:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB2_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-BE-NEXT: and r6, r4, r1
+; CHECK-BE-NEXT: and r7, r5, r2
+; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: bne .LBB2_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-7M-LABEL: test3:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r7, lr}
+; CHECK-7M-NEXT: push {r7, lr}
+; CHECK-7M-NEXT: .pad #8
+; CHECK-7M-NEXT: sub sp, #8
+; CHECK-7M-NEXT: movs r1, #5
+; CHECK-7M-NEXT: str r1, [sp]
+; CHECK-7M-NEXT: bl __atomic_fetch_and_8
+; CHECK-7M-NEXT: add sp, #8
+; CHECK-7M-NEXT: pop {r7, pc}
+;
+; CHECK-8M-LABEL: test3:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r11, lr}
+; CHECK-8M-NEXT: push {r11, lr}
+; CHECK-8M-NEXT: .pad #8
+; CHECK-8M-NEXT: sub sp, sp, #8
+; CHECK-8M-NEXT: mov r1, #5
+; CHECK-8M-NEXT: str r1, [sp]
+; CHECK-8M-NEXT: bl __atomic_fetch_and_8
+; CHECK-8M-NEXT: add sp, sp, #8
+; CHECK-8M-NEXT: pop {r11, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
+
%r = atomicrmw and ptr %ptr, i64 %val seq_cst
ret i64 %r
}
define i64 @test4(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test4:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-LE-DAG: orr [[REG3:(r[0-9]?[02468])]], [[REG1]],
-; CHECK-LE-DAG: orr [[REG4:(r[0-9]?[13579])]], [[REG2]],
-; CHECK-BE-DAG: orr [[REG4:(r[0-9]?[13579])]], [[REG2]],
-; CHECK-BE-DAG: orr [[REG3:(r[0-9]?[02468])]], [[REG1]],
-; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
+; CHECK-LE-LABEL: test4:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB3_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-LE-NEXT: orr r6, r4, r1
+; CHECK-LE-NEXT: orr r7, r5, r2
+; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: bne LBB3_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: mov r0, r4
+; CHECK-LE-NEXT: mov r1, r5
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+;
; CHECK-THUMB-LABEL: test4:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-LE-DAG: orr.w [[REG3:[a-z0-9]+]], [[REG1]],
-; CHECK-THUMB-LE-DAG: orr.w [[REG4:[a-z0-9]+]], [[REG2]],
-; CHECK-THUMB-BE-DAG: orr.w [[REG4:[a-z0-9]+]], [[REG2]],
-; CHECK-THUMB-BE-DAG: orr.w [[REG3:[a-z0-9]+]], [[REG1]],
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
-
-; CHECK-M: __atomic_fetch_or_8
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-NEXT: mov r12, r0
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: .LBB3_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-NEXT: orr.w lr, r0, r2
+; CHECK-THUMB-NEXT: orr.w r4, r1, r3
+; CHECK-THUMB-NEXT: strexd r5, lr, r4, [r12]
+; CHECK-THUMB-NEXT: cmp r5, #0
+; CHECK-THUMB-NEXT: bne .LBB3_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: test4:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB3_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-BE-NEXT: orr r6, r4, r1
+; CHECK-BE-NEXT: orr r7, r5, r2
+; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: bne .LBB3_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-7M-LABEL: test4:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r7, lr}
+; CHECK-7M-NEXT: push {r7, lr}
+; CHECK-7M-NEXT: .pad #8
+; CHECK-7M-NEXT: sub sp, #8
+; CHECK-7M-NEXT: movs r1, #5
+; CHECK-7M-NEXT: str r1, [sp]
+; CHECK-7M-NEXT: bl __atomic_fetch_or_8
+; CHECK-7M-NEXT: add sp, #8
+; CHECK-7M-NEXT: pop {r7, pc}
+;
+; CHECK-8M-LABEL: test4:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r11, lr}
+; CHECK-8M-NEXT: push {r11, lr}
+; CHECK-8M-NEXT: .pad #8
+; CHECK-8M-NEXT: sub sp, sp, #8
+; CHECK-8M-NEXT: mov r1, #5
+; CHECK-8M-NEXT: str r1, [sp]
+; CHECK-8M-NEXT: bl __atomic_fetch_or_8
+; CHECK-8M-NEXT: add sp, sp, #8
+; CHECK-8M-NEXT: pop {r11, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
+
%r = atomicrmw or ptr %ptr, i64 %val seq_cst
ret i64 %r
}
define i64 @test5(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test5:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-LE-DAG: eor [[REG3:(r[0-9]?[02468])]], [[REG1]],
-; CHECK-LE-DAG: eor [[REG4:(r[0-9]?[13579])]], [[REG2]],
-; CHECK-BE-DAG: eor [[REG4:(r[0-9]?[13579])]], [[REG2]],
-; CHECK-BE-DAG: eor [[REG3:(r[0-9]?[02468])]], [[REG1]],
-; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
+; CHECK-LE-LABEL: test5:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB4_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-LE-NEXT: eor r6, r4, r1
+; CHECK-LE-NEXT: eor r7, r5, r2
+; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: bne LBB4_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: mov r0, r4
+; CHECK-LE-NEXT: mov r1, r5
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+;
; CHECK-THUMB-LABEL: test5:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-LE-DAG: eor.w [[REG3:[a-z0-9]+]], [[REG1]],
-; CHECK-THUMB-LE-DAG: eor.w [[REG4:[a-z0-9]+]], [[REG2]],
-; CHECK-THUMB-BE-DAG: eor.w [[REG4:[a-z0-9]+]], [[REG2]],
-; CHECK-THUMB-BE-DAG: eor.w [[REG3:[a-z0-9]+]], [[REG1]],
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
-
-; CHECK-M: __atomic_fetch_xor_8
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-NEXT: mov r12, r0
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: .LBB4_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-NEXT: eor.w lr, r0, r2
+; CHECK-THUMB-NEXT: eor.w r4, r1, r3
+; CHECK-THUMB-NEXT: strexd r5, lr, r4, [r12]
+; CHECK-THUMB-NEXT: cmp r5, #0
+; CHECK-THUMB-NEXT: bne .LBB4_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: test5:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB4_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-BE-NEXT: eor r6, r4, r1
+; CHECK-BE-NEXT: eor r7, r5, r2
+; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: bne .LBB4_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-7M-LABEL: test5:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r7, lr}
+; CHECK-7M-NEXT: push {r7, lr}
+; CHECK-7M-NEXT: .pad #8
+; CHECK-7M-NEXT: sub sp, #8
+; CHECK-7M-NEXT: movs r1, #5
+; CHECK-7M-NEXT: str r1, [sp]
+; CHECK-7M-NEXT: bl __atomic_fetch_xor_8
+; CHECK-7M-NEXT: add sp, #8
+; CHECK-7M-NEXT: pop {r7, pc}
+;
+; CHECK-8M-LABEL: test5:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r11, lr}
+; CHECK-8M-NEXT: push {r11, lr}
+; CHECK-8M-NEXT: .pad #8
+; CHECK-8M-NEXT: sub sp, sp, #8
+; CHECK-8M-NEXT: mov r1, #5
+; CHECK-8M-NEXT: str r1, [sp]
+; CHECK-8M-NEXT: bl __atomic_fetch_xor_8
+; CHECK-8M-NEXT: add sp, sp, #8
+; CHECK-8M-NEXT: pop {r11, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
+
%r = atomicrmw xor ptr %ptr, i64 %val seq_cst
ret i64 %r
}
define i64 @test6(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test6:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: strexd {{[a-z0-9]+}}, {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
+; CHECK-LE-LABEL: test6:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r5, r2
+; CHECK-LE-NEXT: mov r2, r0
+; CHECK-LE-NEXT: mov r4, r1
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB5_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r0, r1, [r2]
+; CHECK-LE-NEXT: strexd r3, r4, r5, [r2]
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: bne LBB5_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, pc}
+;
; CHECK-THUMB-LABEL: test6:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: .save {r7, lr}
+; CHECK-THUMB-NEXT: push {r7, lr}
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: .LBB5_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexd r12, r1, [r0]
+; CHECK-THUMB-NEXT: strexd lr, r2, r3, [r0]
+; CHECK-THUMB-NEXT: cmp.w lr, #0
+; CHECK-THUMB-NEXT: bne .LBB5_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r12
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: pop {r7, pc}
+;
+; CHECK-BE-LABEL: test6:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r5, r2
+; CHECK-BE-NEXT: mov r2, r0
+; CHECK-BE-NEXT: mov r4, r1
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB5_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r0, r1, [r2]
+; CHECK-BE-NEXT: strexd r3, r4, r5, [r2]
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: bne .LBB5_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, pc}
+;
+; CHECK-7M-LABEL: test6:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r7, lr}
+; CHECK-7M-NEXT: push {r7, lr}
+; CHECK-7M-NEXT: .pad #8
+; CHECK-7M-NEXT: sub sp, #8
+; CHECK-7M-NEXT: movs r1, #5
+; CHECK-7M-NEXT: str r1, [sp]
+; CHECK-7M-NEXT: bl __atomic_exchange_8
+; CHECK-7M-NEXT: add sp, #8
+; CHECK-7M-NEXT: pop {r7, pc}
+;
+; CHECK-8M-LABEL: test6:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r11, lr}
+; CHECK-8M-NEXT: push {r11, lr}
+; CHECK-8M-NEXT: .pad #8
+; CHECK-8M-NEXT: sub sp, sp, #8
+; CHECK-8M-NEXT: mov r1, #5
+; CHECK-8M-NEXT: str r1, [sp]
+; CHECK-8M-NEXT: bl __atomic_exchange_8
+; CHECK-8M-NEXT: add sp, sp, #8
+; CHECK-8M-NEXT: pop {r11, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
-; CHECK-M: __atomic_exchange_8
%r = atomicrmw xchg ptr %ptr, i64 %val seq_cst
ret i64 %r
}
define i64 @test7(ptr %ptr, i64 %val1, i64 %val2) {
-; CHECK-LABEL: test7:
-; CHECK-DAG: mov [[VAL1LO:r[0-9]+]], r1
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-LE-DAG: eor [[MISMATCH_LO:.*]], [[REG1]], [[VAL1LO]]
-; CHECK-LE-DAG: eor [[MISMATCH_HI:.*]], [[REG2]], r2
-; CHECK-BE-DAG: eor [[MISMATCH_LO:.*]], [[REG2]], r2
-; CHECK-BE-DAG: eor [[MISMATCH_HI:.*]], [[REG1]], r1
-; CHECK: orrs {{r[0-9]+}}, [[MISMATCH_LO]], [[MISMATCH_HI]]
-; CHECK: bne
-; CHECK-DAG: dmb {{ish$}}
-; CHECK: strexd {{[a-z0-9]+}}, {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}
-; CHECK: cmp
-; CHECK: beq
-; CHECK: dmb {{ish$}}
-
-; CHECK-THUMB-LABEL: test7:
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG1]], r2
-; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG2]], r3
-; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG1]], r2
-; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG2]], r3
-; CHECK-THUMB-LE: orrs.w {{.*}}, [[MISMATCH_LO]], [[MISMATCH_HI]]
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
-; CHECK-THUMB: cmp
-; CHECK-THUMB: beq
-; CHECK-THUMB: dmb {{ish$}}
-
-; CHECK-M: __atomic_compare_exchange_8
+; CHECK-LE-LABEL: test7:
+; CHECK-LE: @ %bb.0: @ %cmpxchg.start
+; CHECK-LE-NEXT: push {r4, r6, r10, r11, lr}
+; CHECK-LE-NEXT: mov r10, r3
+; CHECK-LE-NEXT: mov r9, r1
+; CHECK-LE-NEXT: mov r3, r0
+; CHECK-LE-NEXT: ldrexd r0, r1, [r0]
+; CHECK-LE-NEXT: eor r12, r1, r2
+; CHECK-LE-NEXT: eor r4, r0, r9
+; CHECK-LE-NEXT: orrs r4, r4, r12
+; CHECK-LE-NEXT: bne LBB6_4
+; CHECK-LE-NEXT: @ %bb.1: @ %cmpxchg.fencedstore
+; CHECK-LE-NEXT: ldr r11, [sp, #20]
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB6_2: @ %cmpxchg.trystore
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: strexd r4, r10, r11, [r3]
+; CHECK-LE-NEXT: cmp r4, #0
+; CHECK-LE-NEXT: beq LBB6_5
+; CHECK-LE-NEXT: @ %bb.3: @ %cmpxchg.releasedload
+; CHECK-LE-NEXT: @ in Loop: Header=BB6_2 Depth=1
+; CHECK-LE-NEXT: ldrexd r0, r1, [r3]
+; CHECK-LE-NEXT: eor r4, r0, r9
+; CHECK-LE-NEXT: eor r6, r1, r2
+; CHECK-LE-NEXT: orrs r6, r4, r6
+; CHECK-LE-NEXT: beq LBB6_2
+; CHECK-LE-NEXT: LBB6_4: @ %cmpxchg.nostore
+; CHECK-LE-NEXT: clrex
+; CHECK-LE-NEXT: LBB6_5: @ %cmpxchg.end
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r6, r10, r11, pc}
+;
+; CHECK-THUMB-LE-LABEL: test7:
+; CHECK-THUMB-LE: @ %bb.0: @ %cmpxchg.start
+; CHECK-THUMB-LE-NEXT: .save {r4, r5, r6, lr}
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r6, lr}
+; CHECK-THUMB-LE-NEXT: mov r12, r0
+; CHECK-THUMB-LE-NEXT: ldrexd r0, r1, [r0]
+; CHECK-THUMB-LE-NEXT: eor.w lr, r1, r3
+; CHECK-THUMB-LE-NEXT: eor.w r4, r0, r2
+; CHECK-THUMB-LE-NEXT: orrs.w r4, r4, lr
+; CHECK-THUMB-LE-NEXT: bne .LBB6_4
+; CHECK-THUMB-LE-NEXT: @ %bb.1: @ %cmpxchg.fencedstore
+; CHECK-THUMB-LE-NEXT: ldrd r4, lr, [sp, #16]
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: .LBB6_2: @ %cmpxchg.trystore
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-LE-NEXT: cbz r5, .LBB6_5
+; CHECK-THUMB-LE-NEXT: @ %bb.3: @ %cmpxchg.releasedload
+; CHECK-THUMB-LE-NEXT: @ in Loop: Header=BB6_2 Depth=1
+; CHECK-THUMB-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-LE-NEXT: eor.w r5, r0, r2
+; CHECK-THUMB-LE-NEXT: eor.w r6, r1, r3
+; CHECK-THUMB-LE-NEXT: orrs r5, r6
+; CHECK-THUMB-LE-NEXT: beq .LBB6_2
+; CHECK-THUMB-LE-NEXT: .LBB6_4: @ %cmpxchg.nostore
+; CHECK-THUMB-LE-NEXT: clrex
+; CHECK-THUMB-LE-NEXT: .LBB6_5: @ %cmpxchg.end
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r6, pc}
+;
+; CHECK-BE-LABEL: test7:
+; CHECK-BE: @ %bb.0: @ %cmpxchg.start
+; CHECK-BE-NEXT: push {r4, r6, r8, r9, lr}
+; CHECK-BE-NEXT: mov r8, r3
+; CHECK-BE-NEXT: mov r12, r1
+; CHECK-BE-NEXT: mov r3, r0
+; CHECK-BE-NEXT: ldrexd r0, r1, [r0]
+; CHECK-BE-NEXT: eor lr, r0, r12
+; CHECK-BE-NEXT: eor r6, r1, r2
+; CHECK-BE-NEXT: orrs r6, r6, lr
+; CHECK-BE-NEXT: bne .LBB6_4
+; CHECK-BE-NEXT: @ %bb.1: @ %cmpxchg.fencedstore
+; CHECK-BE-NEXT: ldr r9, [sp, #20]
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB6_2: @ %cmpxchg.trystore
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: strexd r6, r8, r9, [r3]
+; CHECK-BE-NEXT: cmp r6, #0
+; CHECK-BE-NEXT: beq .LBB6_5
+; CHECK-BE-NEXT: @ %bb.3: @ %cmpxchg.releasedload
+; CHECK-BE-NEXT: @ in Loop: Header=BB6_2 Depth=1
+; CHECK-BE-NEXT: ldrexd r0, r1, [r3]
+; CHECK-BE-NEXT: eor r6, r0, r12
+; CHECK-BE-NEXT: eor r4, r1, r2
+; CHECK-BE-NEXT: orrs r6, r4, r6
+; CHECK-BE-NEXT: beq .LBB6_2
+; CHECK-BE-NEXT: .LBB6_4: @ %cmpxchg.nostore
+; CHECK-BE-NEXT: clrex
+; CHECK-BE-NEXT: .LBB6_5: @ %cmpxchg.end
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r6, r8, r9, pc}
+;
+; CHECK-THUMB-BE-LABEL: test7:
+; CHECK-THUMB-BE: @ %bb.0: @ %cmpxchg.start
+; CHECK-THUMB-BE-NEXT: .save {r4, r5, r6, lr}
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r6, lr}
+; CHECK-THUMB-BE-NEXT: mov r12, r0
+; CHECK-THUMB-BE-NEXT: ldrexd r0, r1, [r0]
+; CHECK-THUMB-BE-NEXT: eor.w lr, r0, r2
+; CHECK-THUMB-BE-NEXT: eor.w r4, r1, r3
+; CHECK-THUMB-BE-NEXT: orrs.w r4, r4, lr
+; CHECK-THUMB-BE-NEXT: bne .LBB6_4
+; CHECK-THUMB-BE-NEXT: @ %bb.1: @ %cmpxchg.fencedstore
+; CHECK-THUMB-BE-NEXT: ldrd r4, lr, [sp, #16]
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: .LBB6_2: @ %cmpxchg.trystore
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-BE-NEXT: cbz r5, .LBB6_5
+; CHECK-THUMB-BE-NEXT: @ %bb.3: @ %cmpxchg.releasedload
+; CHECK-THUMB-BE-NEXT: @ in Loop: Header=BB6_2 Depth=1
+; CHECK-THUMB-BE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-BE-NEXT: eor.w r5, r0, r2
+; CHECK-THUMB-BE-NEXT: eor.w r6, r1, r3
+; CHECK-THUMB-BE-NEXT: orrs r5, r6
+; CHECK-THUMB-BE-NEXT: beq .LBB6_2
+; CHECK-THUMB-BE-NEXT: .LBB6_4: @ %cmpxchg.nostore
+; CHECK-THUMB-BE-NEXT: clrex
+; CHECK-THUMB-BE-NEXT: .LBB6_5: @ %cmpxchg.end
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r6, pc}
+;
+; CHECK-7M-LABEL: test7:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r7, lr}
+; CHECK-7M-NEXT: push {r7, lr}
+; CHECK-7M-NEXT: .pad #16
+; CHECK-7M-NEXT: sub sp, #16
+; CHECK-7M-NEXT: strd r2, r3, [sp, #8]
+; CHECK-7M-NEXT: movs r1, #5
+; CHECK-7M-NEXT: ldrd r2, r3, [sp, #24]
+; CHECK-7M-NEXT: strd r1, r1, [sp]
+; CHECK-7M-NEXT: add r1, sp, #8
+; CHECK-7M-NEXT: bl __atomic_compare_exchange_8
+; CHECK-7M-NEXT: ldrd r0, r1, [sp, #8]
+; CHECK-7M-NEXT: add sp, #16
+; CHECK-7M-NEXT: pop {r7, pc}
+;
+; CHECK-8M-LABEL: test7:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r11, lr}
+; CHECK-8M-NEXT: push {r11, lr}
+; CHECK-8M-NEXT: .pad #16
+; CHECK-8M-NEXT: sub sp, sp, #16
+; CHECK-8M-NEXT: mov r1, #5
+; CHECK-8M-NEXT: str r3, [sp, #12]
+; CHECK-8M-NEXT: str r2, [sp, #8]
+; CHECK-8M-NEXT: ldr r2, [sp, #24]
+; CHECK-8M-NEXT: ldr r3, [sp, #28]
+; CHECK-8M-NEXT: str r1, [sp]
+; CHECK-8M-NEXT: str r1, [sp, #4]
+; CHECK-8M-NEXT: add r1, sp, #8
+; CHECK-8M-NEXT: bl __atomic_compare_exchange_8
+; CHECK-8M-NEXT: ldr r0, [sp, #8]
+; CHECK-8M-NEXT: ldr r1, [sp, #12]
+; CHECK-8M-NEXT: add sp, sp, #16
+; CHECK-8M-NEXT: pop {r11, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
+
%pair = cmpxchg ptr %ptr, i64 %val1, i64 %val2 seq_cst seq_cst
%r = extractvalue { i64, i1 } %pair, 0
@@ -224,20 +727,37 @@ define i64 @test7(ptr %ptr, i64 %val1, i64 %val2) {
; isn't supported.
define i64 @test8(ptr %ptr) {
; CHECK-LABEL: test8:
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-NOT: strexd
-; CHECK: clrex
-; CHECK-NOT: strexd
-; CHECK: dmb {{ish$}}
-
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ldrexd r0, r1, [r0]
+; CHECK-NEXT: clrex
+; CHECK-NEXT: dmb ish
+; CHECK-NEXT: bx lr
+;
; CHECK-THUMB-LABEL: test8:
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-NOT: strexd
-; CHECK-THUMB: clrex
-; CHECK-THUMB-NOT: strexd
-; CHECK-THUMB: dmb {{ish$}}
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: ldrexd r0, r1, [r0]
+; CHECK-THUMB-NEXT: clrex
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: bx lr
+;
+; CHECK-7M-LABEL: test8:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r7, lr}
+; CHECK-7M-NEXT: push {r7, lr}
+; CHECK-7M-NEXT: movs r1, #5
+; CHECK-7M-NEXT: bl __atomic_load_8
+; CHECK-7M-NEXT: pop {r7, pc}
+;
+; CHECK-8M-LABEL: test8:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r11, lr}
+; CHECK-8M-NEXT: push {r11, lr}
+; CHECK-8M-NEXT: mov r1, #5
+; CHECK-8M-NEXT: bl __atomic_load_8
+; CHECK-8M-NEXT: pop {r11, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
-; CHECK-M: __atomic_load_8
%r = load atomic i64, ptr %ptr seq_cst, align 8
ret i64 %r
@@ -247,203 +767,825 @@ define i64 @test8(ptr %ptr) {
; way to write it. Except on M class devices, where ldrexd/strexd aren't
; supported.
define void @test9(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test9:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: strexd {{[a-z0-9]+}}, {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
+; CHECK-LE-LABEL: test9:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r3, r2
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: mov r2, r1
+; CHECK-LE-NEXT: LBB8_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-LE-NEXT: strexd r1, r2, r3, [r0]
+; CHECK-LE-NEXT: cmp r1, #0
+; CHECK-LE-NEXT: bne LBB8_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, pc}
+;
; CHECK-THUMB-LABEL: test9:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: .LBB8_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexd r1, r12, [r0]
+; CHECK-THUMB-NEXT: strexd r1, r2, r3, [r0]
+; CHECK-THUMB-NEXT: cmp r1, #0
+; CHECK-THUMB-NEXT: bne .LBB8_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: bx lr
+;
+; CHECK-BE-LABEL: test9:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r3, r2
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: mov r2, r1
+; CHECK-BE-NEXT: .LBB8_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-BE-NEXT: strexd r1, r2, r3, [r0]
+; CHECK-BE-NEXT: cmp r1, #0
+; CHECK-BE-NEXT: bne .LBB8_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, pc}
+;
+; CHECK-7M-LABEL: test9:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r7, lr}
+; CHECK-7M-NEXT: push {r7, lr}
+; CHECK-7M-NEXT: .pad #8
+; CHECK-7M-NEXT: sub sp, #8
+; CHECK-7M-NEXT: movs r1, #5
+; CHECK-7M-NEXT: str r1, [sp]
+; CHECK-7M-NEXT: bl __atomic_store_8
+; CHECK-7M-NEXT: add sp, #8
+; CHECK-7M-NEXT: pop {r7, pc}
+;
+; CHECK-8M-LABEL: test9:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r11, lr}
+; CHECK-8M-NEXT: push {r11, lr}
+; CHECK-8M-NEXT: .pad #8
+; CHECK-8M-NEXT: sub sp, sp, #8
+; CHECK-8M-NEXT: mov r1, #5
+; CHECK-8M-NEXT: str r1, [sp]
+; CHECK-8M-NEXT: bl __atomic_store_8
+; CHECK-8M-NEXT: add sp, sp, #8
+; CHECK-8M-NEXT: pop {r11, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
-; CHECK-M: __atomic_store_8
store atomic i64 %val, ptr %ptr seq_cst, align 8
ret void
}
define i64 @test10(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test10:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK-LE: subs {{[^,]+}}, r1, [[REG1]]
-; CHECK-BE: subs {{[^,]+}}, r2, [[REG2]]
-; CHECK-LE: sbcs {{[^,]+}}, r2, [[REG2]]
-; CHECK-BE: sbcs {{[^,]+}}, r1, [[REG1]]
-; CHECK: mov [[CMP:[a-z0-9]+]], #0
-; CHECK: movwge [[CMP]], #1
-; CHECK: cmp [[CMP]], #0
-; CHECK: movne [[OUT_HI]], [[REG2]]
-; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
-; CHECK: movne [[OUT_LO]], [[REG1]]
-; CHECK: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
-; CHECK-THUMB-LABEL: test10:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
-; CHECK-THUMB-LE: subs.w {{[^,]+}}, r2, [[REG1]]
-; CHECK-THUMB-BE: subs.w {{[^,]+}}, r3, [[REG2]]
-; CHECK-THUMB-LE: sbcs.w {{[^,]+}}, r3, [[REG2]]
-; CHECK-THUMB-BE: sbcs.w {{[^,]+}}, r2, [[REG1]]
-; CHECK-THUMB: mov.w [[CMP:[a-z0-9]+]], #0
-; CHECK-THUMB: movge.w [[CMP]], #1
-; CHECK-THUMB: cmp.w [[CMP]], #0
-; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
-; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
-; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
-
-; CHECK-M: __atomic_compare_exchange_8
+; CHECK-LE-LABEL: test10:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB9_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-LE-NEXT: mov r7, r2
+; CHECK-LE-NEXT: subs r3, r1, r4
+; CHECK-LE-NEXT: sbcs r3, r2, r5
+; CHECK-LE-NEXT: mov r3, #0
+; CHECK-LE-NEXT: movwge r3, #1
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: movne r7, r5
+; CHECK-LE-NEXT: mov r6, r1
+; CHECK-LE-NEXT: movne r6, r4
+; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: bne LBB9_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: mov r0, r4
+; CHECK-LE-NEXT: mov r1, r5
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-LE-LABEL: test10:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: mov r12, r0
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: .LBB9_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-LE-NEXT: mov r4, r2
+; CHECK-THUMB-LE-NEXT: subs.w lr, r2, r0
+; CHECK-THUMB-LE-NEXT: sbcs.w lr, r3, r1
+; CHECK-THUMB-LE-NEXT: mov.w lr, #0
+; CHECK-THUMB-LE-NEXT: it ge
+; CHECK-THUMB-LE-NEXT: movge.w lr, #1
+; CHECK-THUMB-LE-NEXT: cmp.w lr, #0
+; CHECK-THUMB-LE-NEXT: mov lr, r3
+; CHECK-THUMB-LE-NEXT: itt ne
+; CHECK-THUMB-LE-NEXT: movne lr, r1
+; CHECK-THUMB-LE-NEXT: movne r4, r0
+; CHECK-THUMB-LE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB9_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: test10:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB9_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-BE-NEXT: mov r7, r2
+; CHECK-BE-NEXT: subs r3, r2, r5
+; CHECK-BE-NEXT: sbcs r3, r1, r4
+; CHECK-BE-NEXT: mov r3, #0
+; CHECK-BE-NEXT: movwge r3, #1
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: movne r7, r5
+; CHECK-BE-NEXT: mov r6, r1
+; CHECK-BE-NEXT: movne r6, r4
+; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: bne .LBB9_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test10:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: mov r12, r0
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: .LBB9_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-BE-NEXT: mov r4, r2
+; CHECK-THUMB-BE-NEXT: subs.w lr, r3, r1
+; CHECK-THUMB-BE-NEXT: sbcs.w lr, r2, r0
+; CHECK-THUMB-BE-NEXT: mov.w lr, #0
+; CHECK-THUMB-BE-NEXT: it ge
+; CHECK-THUMB-BE-NEXT: movge.w lr, #1
+; CHECK-THUMB-BE-NEXT: cmp.w lr, #0
+; CHECK-THUMB-BE-NEXT: mov lr, r3
+; CHECK-THUMB-BE-NEXT: itt ne
+; CHECK-THUMB-BE-NEXT: movne lr, r1
+; CHECK-THUMB-BE-NEXT: movne r4, r0
+; CHECK-THUMB-BE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB9_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-7M-LABEL: test10:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-7M-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-7M-NEXT: .pad #16
+; CHECK-7M-NEXT: sub sp, #16
+; CHECK-7M-NEXT: mov r4, r3
+; CHECK-7M-NEXT: mov r5, r2
+; CHECK-7M-NEXT: ldrd r2, r3, [r0]
+; CHECK-7M-NEXT: add.w r8, sp, #8
+; CHECK-7M-NEXT: mov r6, r0
+; CHECK-7M-NEXT: movs r7, #5
+; CHECK-7M-NEXT: .LBB9_1: @ %atomicrmw.start
+; CHECK-7M-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-7M-NEXT: subs r0, r5, r2
+; CHECK-7M-NEXT: strd r2, r3, [sp, #8]
+; CHECK-7M-NEXT: sbcs.w r0, r4, r3
+; CHECK-7M-NEXT: strd r7, r7, [sp]
+; CHECK-7M-NEXT: mov.w r0, #0
+; CHECK-7M-NEXT: mov r1, r8
+; CHECK-7M-NEXT: it ge
+; CHECK-7M-NEXT: movge r0, #1
+; CHECK-7M-NEXT: cmp r0, #0
+; CHECK-7M-NEXT: mov r0, r6
+; CHECK-7M-NEXT: itt eq
+; CHECK-7M-NEXT: moveq r2, r5
+; CHECK-7M-NEXT: moveq r3, r4
+; CHECK-7M-NEXT: bl __atomic_compare_exchange_8
+; CHECK-7M-NEXT: ldrd r2, r3, [sp, #8]
+; CHECK-7M-NEXT: cmp r0, #0
+; CHECK-7M-NEXT: beq .LBB9_1
+; CHECK-7M-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-7M-NEXT: mov r0, r2
+; CHECK-7M-NEXT: mov r1, r3
+; CHECK-7M-NEXT: add sp, #16
+; CHECK-7M-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+;
+; CHECK-8M-LABEL: test10:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: push {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: .pad #16
+; CHECK-8M-NEXT: sub sp, sp, #16
+; CHECK-8M-NEXT: add r8, sp, #8
+; CHECK-8M-NEXT: mov r4, r3
+; CHECK-8M-NEXT: mov r5, r2
+; CHECK-8M-NEXT: mov r6, r0
+; CHECK-8M-NEXT: ldm r0, {r2, r3}
+; CHECK-8M-NEXT: mov r7, #5
+; CHECK-8M-NEXT: .LBB9_1: @ %atomicrmw.start
+; CHECK-8M-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-8M-NEXT: subs r0, r5, r2
+; CHECK-8M-NEXT: str r2, [sp, #8]
+; CHECK-8M-NEXT: sbcs r0, r4, r3
+; CHECK-8M-NEXT: str r3, [sp, #12]
+; CHECK-8M-NEXT: mov r0, #0
+; CHECK-8M-NEXT: mov r1, r8
+; CHECK-8M-NEXT: movge r0, #1
+; CHECK-8M-NEXT: cmp r0, #0
+; CHECK-8M-NEXT: moveq r2, r5
+; CHECK-8M-NEXT: moveq r3, r4
+; CHECK-8M-NEXT: mov r0, r6
+; CHECK-8M-NEXT: str r7, [sp]
+; CHECK-8M-NEXT: str r7, [sp, #4]
+; CHECK-8M-NEXT: bl __atomic_compare_exchange_8
+; CHECK-8M-NEXT: ldr r2, [sp, #8]
+; CHECK-8M-NEXT: cmp r0, #0
+; CHECK-8M-NEXT: ldr r3, [sp, #12]
+; CHECK-8M-NEXT: beq .LBB9_1
+; CHECK-8M-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-8M-NEXT: mov r0, r2
+; CHECK-8M-NEXT: mov r1, r3
+; CHECK-8M-NEXT: add sp, sp, #16
+; CHECK-8M-NEXT: pop {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
+
%r = atomicrmw min ptr %ptr, i64 %val seq_cst
ret i64 %r
}
define i64 @test11(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test11:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK-LE: subs {{[^,]+}}, r1, [[REG1]]
-; CHECK-BE: subs {{[^,]+}}, r2, [[REG2]]
-; CHECK-LE: sbcs {{[^,]+}}, r2, [[REG2]]
-; CHECK-BE: sbcs {{[^,]+}}, r1, [[REG1]]
-; CHECK: mov [[CMP:[a-z0-9]+]], #0
-; CHECK: movwhs [[CMP]], #1
-; CHECK: cmp [[CMP]], #0
-; CHECK: movne [[OUT_HI]], [[REG2]]
-; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
-; CHECK: movne [[OUT_LO]], [[REG1]]
-; CHECK: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
-; CHECK-THUMB-LABEL: test11:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
-; CHECK-THUMB-LE: subs.w {{[^,]+}}, r2, [[REG1]]
-; CHECK-THUMB-BE: subs.w {{[^,]+}}, r3, [[REG2]]
-; CHECK-THUMB-LE: sbcs.w {{[^,]+}}, r3, [[REG2]]
-; CHECK-THUMB-BE: sbcs.w {{[^,]+}}, r2, [[REG1]]
-; CHECK-THUMB: mov.w [[CMP:[a-z0-9]+]], #0
-; CHECK-THUMB: movhs.w [[CMP]], #1
-; CHECK-THUMB: cmp.w [[CMP]], #0
-; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
-; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
-; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
-
-; CHECK-M: __atomic_compare_exchange_8
+; CHECK-LE-LABEL: test11:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB10_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-LE-NEXT: mov r7, r2
+; CHECK-LE-NEXT: subs r3, r1, r4
+; CHECK-LE-NEXT: sbcs r3, r2, r5
+; CHECK-LE-NEXT: mov r3, #0
+; CHECK-LE-NEXT: movwhs r3, #1
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: movne r7, r5
+; CHECK-LE-NEXT: mov r6, r1
+; CHECK-LE-NEXT: movne r6, r4
+; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: bne LBB10_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: mov r0, r4
+; CHECK-LE-NEXT: mov r1, r5
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-LE-LABEL: test11:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: mov r12, r0
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: .LBB10_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-LE-NEXT: mov r4, r2
+; CHECK-THUMB-LE-NEXT: subs.w lr, r2, r0
+; CHECK-THUMB-LE-NEXT: sbcs.w lr, r3, r1
+; CHECK-THUMB-LE-NEXT: mov.w lr, #0
+; CHECK-THUMB-LE-NEXT: it hs
+; CHECK-THUMB-LE-NEXT: movhs.w lr, #1
+; CHECK-THUMB-LE-NEXT: cmp.w lr, #0
+; CHECK-THUMB-LE-NEXT: mov lr, r3
+; CHECK-THUMB-LE-NEXT: itt ne
+; CHECK-THUMB-LE-NEXT: movne lr, r1
+; CHECK-THUMB-LE-NEXT: movne r4, r0
+; CHECK-THUMB-LE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB10_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: test11:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB10_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-BE-NEXT: mov r7, r2
+; CHECK-BE-NEXT: subs r3, r2, r5
+; CHECK-BE-NEXT: sbcs r3, r1, r4
+; CHECK-BE-NEXT: mov r3, #0
+; CHECK-BE-NEXT: movwhs r3, #1
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: movne r7, r5
+; CHECK-BE-NEXT: mov r6, r1
+; CHECK-BE-NEXT: movne r6, r4
+; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: bne .LBB10_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test11:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: mov r12, r0
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: .LBB10_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-BE-NEXT: mov r4, r2
+; CHECK-THUMB-BE-NEXT: subs.w lr, r3, r1
+; CHECK-THUMB-BE-NEXT: sbcs.w lr, r2, r0
+; CHECK-THUMB-BE-NEXT: mov.w lr, #0
+; CHECK-THUMB-BE-NEXT: it hs
+; CHECK-THUMB-BE-NEXT: movhs.w lr, #1
+; CHECK-THUMB-BE-NEXT: cmp.w lr, #0
+; CHECK-THUMB-BE-NEXT: mov lr, r3
+; CHECK-THUMB-BE-NEXT: itt ne
+; CHECK-THUMB-BE-NEXT: movne lr, r1
+; CHECK-THUMB-BE-NEXT: movne r4, r0
+; CHECK-THUMB-BE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB10_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-7M-LABEL: test11:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-7M-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-7M-NEXT: .pad #16
+; CHECK-7M-NEXT: sub sp, #16
+; CHECK-7M-NEXT: mov r4, r3
+; CHECK-7M-NEXT: mov r5, r2
+; CHECK-7M-NEXT: ldrd r2, r3, [r0]
+; CHECK-7M-NEXT: add.w r8, sp, #8
+; CHECK-7M-NEXT: mov r6, r0
+; CHECK-7M-NEXT: movs r7, #5
+; CHECK-7M-NEXT: .LBB10_1: @ %atomicrmw.start
+; CHECK-7M-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-7M-NEXT: subs r0, r5, r2
+; CHECK-7M-NEXT: strd r2, r3, [sp, #8]
+; CHECK-7M-NEXT: sbcs.w r0, r4, r3
+; CHECK-7M-NEXT: strd r7, r7, [sp]
+; CHECK-7M-NEXT: mov.w r0, #0
+; CHECK-7M-NEXT: mov r1, r8
+; CHECK-7M-NEXT: it hs
+; CHECK-7M-NEXT: movhs r0, #1
+; CHECK-7M-NEXT: cmp r0, #0
+; CHECK-7M-NEXT: mov r0, r6
+; CHECK-7M-NEXT: itt eq
+; CHECK-7M-NEXT: moveq r2, r5
+; CHECK-7M-NEXT: moveq r3, r4
+; CHECK-7M-NEXT: bl __atomic_compare_exchange_8
+; CHECK-7M-NEXT: ldrd r2, r3, [sp, #8]
+; CHECK-7M-NEXT: cmp r0, #0
+; CHECK-7M-NEXT: beq .LBB10_1
+; CHECK-7M-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-7M-NEXT: mov r0, r2
+; CHECK-7M-NEXT: mov r1, r3
+; CHECK-7M-NEXT: add sp, #16
+; CHECK-7M-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+;
+; CHECK-8M-LABEL: test11:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: push {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: .pad #16
+; CHECK-8M-NEXT: sub sp, sp, #16
+; CHECK-8M-NEXT: add r8, sp, #8
+; CHECK-8M-NEXT: mov r4, r3
+; CHECK-8M-NEXT: mov r5, r2
+; CHECK-8M-NEXT: mov r6, r0
+; CHECK-8M-NEXT: ldm r0, {r2, r3}
+; CHECK-8M-NEXT: mov r7, #5
+; CHECK-8M-NEXT: .LBB10_1: @ %atomicrmw.start
+; CHECK-8M-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-8M-NEXT: subs r0, r5, r2
+; CHECK-8M-NEXT: str r2, [sp, #8]
+; CHECK-8M-NEXT: sbcs r0, r4, r3
+; CHECK-8M-NEXT: str r3, [sp, #12]
+; CHECK-8M-NEXT: mov r0, #0
+; CHECK-8M-NEXT: mov r1, r8
+; CHECK-8M-NEXT: movhs r0, #1
+; CHECK-8M-NEXT: cmp r0, #0
+; CHECK-8M-NEXT: moveq r2, r5
+; CHECK-8M-NEXT: moveq r3, r4
+; CHECK-8M-NEXT: mov r0, r6
+; CHECK-8M-NEXT: str r7, [sp]
+; CHECK-8M-NEXT: str r7, [sp, #4]
+; CHECK-8M-NEXT: bl __atomic_compare_exchange_8
+; CHECK-8M-NEXT: ldr r2, [sp, #8]
+; CHECK-8M-NEXT: cmp r0, #0
+; CHECK-8M-NEXT: ldr r3, [sp, #12]
+; CHECK-8M-NEXT: beq .LBB10_1
+; CHECK-8M-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-8M-NEXT: mov r0, r2
+; CHECK-8M-NEXT: mov r1, r3
+; CHECK-8M-NEXT: add sp, sp, #16
+; CHECK-8M-NEXT: pop {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
+
%r = atomicrmw umin ptr %ptr, i64 %val seq_cst
ret i64 %r
}
define i64 @test12(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test12:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK-LE: subs {{[^,]+}}, r1, [[REG1]]
-; CHECK-BE: subs {{[^,]+}}, r2, [[REG2]]
-; CHECK-LE: sbcs {{[^,]+}}, r2, [[REG2]]
-; CHECK-BE: sbcs {{[^,]+}}, r1, [[REG1]]
-; CHECK: mov [[CMP:[a-z0-9]+]], #0
-; CHECK: movwlt [[CMP]], #1
-; CHECK: cmp [[CMP]], #0
-; CHECK: movne [[OUT_HI]], [[REG2]]
-; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
-; CHECK: movne [[OUT_LO]], [[REG1]]
-; CHECK: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
-; CHECK-THUMB-LABEL: test12:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
-; CHECK-THUMB-LE: subs.w {{[^,]+}}, r2, [[REG1]]
-; CHECK-THUMB-BE: subs.w {{[^,]+}}, r3, [[REG2]]
-; CHECK-THUMB-LE: sbcs.w {{[^,]+}}, r3, [[REG2]]
-; CHECK-THUMB-BE: sbcs.w {{[^,]+}}, r2, [[REG1]]
-; CHECK-THUMB: mov.w [[CMP:[a-z0-9]+]], #0
-; CHECK-THUMB: movlt.w [[CMP]], #1
-; CHECK-THUMB: cmp.w [[CMP]], #0
-; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
-; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
-; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
-
-; CHECK-M: __atomic_compare_exchange_8
+; CHECK-LE-LABEL: test12:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB11_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-LE-NEXT: mov r7, r2
+; CHECK-LE-NEXT: subs r3, r1, r4
+; CHECK-LE-NEXT: sbcs r3, r2, r5
+; CHECK-LE-NEXT: mov r3, #0
+; CHECK-LE-NEXT: movwlt r3, #1
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: movne r7, r5
+; CHECK-LE-NEXT: mov r6, r1
+; CHECK-LE-NEXT: movne r6, r4
+; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: bne LBB11_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: mov r0, r4
+; CHECK-LE-NEXT: mov r1, r5
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-LE-LABEL: test12:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: mov r12, r0
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: .LBB11_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-LE-NEXT: mov r4, r2
+; CHECK-THUMB-LE-NEXT: subs.w lr, r2, r0
+; CHECK-THUMB-LE-NEXT: sbcs.w lr, r3, r1
+; CHECK-THUMB-LE-NEXT: mov.w lr, #0
+; CHECK-THUMB-LE-NEXT: it lt
+; CHECK-THUMB-LE-NEXT: movlt.w lr, #1
+; CHECK-THUMB-LE-NEXT: cmp.w lr, #0
+; CHECK-THUMB-LE-NEXT: mov lr, r3
+; CHECK-THUMB-LE-NEXT: itt ne
+; CHECK-THUMB-LE-NEXT: movne lr, r1
+; CHECK-THUMB-LE-NEXT: movne r4, r0
+; CHECK-THUMB-LE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB11_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: test12:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB11_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-BE-NEXT: mov r7, r2
+; CHECK-BE-NEXT: subs r3, r2, r5
+; CHECK-BE-NEXT: sbcs r3, r1, r4
+; CHECK-BE-NEXT: mov r3, #0
+; CHECK-BE-NEXT: movwlt r3, #1
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: movne r7, r5
+; CHECK-BE-NEXT: mov r6, r1
+; CHECK-BE-NEXT: movne r6, r4
+; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: bne .LBB11_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test12:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: mov r12, r0
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: .LBB11_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-BE-NEXT: mov r4, r2
+; CHECK-THUMB-BE-NEXT: subs.w lr, r3, r1
+; CHECK-THUMB-BE-NEXT: sbcs.w lr, r2, r0
+; CHECK-THUMB-BE-NEXT: mov.w lr, #0
+; CHECK-THUMB-BE-NEXT: it lt
+; CHECK-THUMB-BE-NEXT: movlt.w lr, #1
+; CHECK-THUMB-BE-NEXT: cmp.w lr, #0
+; CHECK-THUMB-BE-NEXT: mov lr, r3
+; CHECK-THUMB-BE-NEXT: itt ne
+; CHECK-THUMB-BE-NEXT: movne lr, r1
+; CHECK-THUMB-BE-NEXT: movne r4, r0
+; CHECK-THUMB-BE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB11_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-7M-LABEL: test12:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-7M-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-7M-NEXT: .pad #16
+; CHECK-7M-NEXT: sub sp, #16
+; CHECK-7M-NEXT: mov r4, r3
+; CHECK-7M-NEXT: mov r5, r2
+; CHECK-7M-NEXT: ldrd r2, r3, [r0]
+; CHECK-7M-NEXT: add.w r8, sp, #8
+; CHECK-7M-NEXT: mov r6, r0
+; CHECK-7M-NEXT: movs r7, #5
+; CHECK-7M-NEXT: .LBB11_1: @ %atomicrmw.start
+; CHECK-7M-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-7M-NEXT: subs r0, r5, r2
+; CHECK-7M-NEXT: strd r2, r3, [sp, #8]
+; CHECK-7M-NEXT: sbcs.w r0, r4, r3
+; CHECK-7M-NEXT: strd r7, r7, [sp]
+; CHECK-7M-NEXT: mov.w r0, #0
+; CHECK-7M-NEXT: mov r1, r8
+; CHECK-7M-NEXT: it lt
+; CHECK-7M-NEXT: movlt r0, #1
+; CHECK-7M-NEXT: cmp r0, #0
+; CHECK-7M-NEXT: mov r0, r6
+; CHECK-7M-NEXT: itt eq
+; CHECK-7M-NEXT: moveq r2, r5
+; CHECK-7M-NEXT: moveq r3, r4
+; CHECK-7M-NEXT: bl __atomic_compare_exchange_8
+; CHECK-7M-NEXT: ldrd r2, r3, [sp, #8]
+; CHECK-7M-NEXT: cmp r0, #0
+; CHECK-7M-NEXT: beq .LBB11_1
+; CHECK-7M-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-7M-NEXT: mov r0, r2
+; CHECK-7M-NEXT: mov r1, r3
+; CHECK-7M-NEXT: add sp, #16
+; CHECK-7M-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+;
+; CHECK-8M-LABEL: test12:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: push {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: .pad #16
+; CHECK-8M-NEXT: sub sp, sp, #16
+; CHECK-8M-NEXT: add r8, sp, #8
+; CHECK-8M-NEXT: mov r4, r3
+; CHECK-8M-NEXT: mov r5, r2
+; CHECK-8M-NEXT: mov r6, r0
+; CHECK-8M-NEXT: ldm r0, {r2, r3}
+; CHECK-8M-NEXT: mov r7, #5
+; CHECK-8M-NEXT: .LBB11_1: @ %atomicrmw.start
+; CHECK-8M-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-8M-NEXT: subs r0, r5, r2
+; CHECK-8M-NEXT: str r2, [sp, #8]
+; CHECK-8M-NEXT: sbcs r0, r4, r3
+; CHECK-8M-NEXT: str r3, [sp, #12]
+; CHECK-8M-NEXT: mov r0, #0
+; CHECK-8M-NEXT: mov r1, r8
+; CHECK-8M-NEXT: movlt r0, #1
+; CHECK-8M-NEXT: cmp r0, #0
+; CHECK-8M-NEXT: moveq r2, r5
+; CHECK-8M-NEXT: moveq r3, r4
+; CHECK-8M-NEXT: mov r0, r6
+; CHECK-8M-NEXT: str r7, [sp]
+; CHECK-8M-NEXT: str r7, [sp, #4]
+; CHECK-8M-NEXT: bl __atomic_compare_exchange_8
+; CHECK-8M-NEXT: ldr r2, [sp, #8]
+; CHECK-8M-NEXT: cmp r0, #0
+; CHECK-8M-NEXT: ldr r3, [sp, #12]
+; CHECK-8M-NEXT: beq .LBB11_1
+; CHECK-8M-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-8M-NEXT: mov r0, r2
+; CHECK-8M-NEXT: mov r1, r3
+; CHECK-8M-NEXT: add sp, sp, #16
+; CHECK-8M-NEXT: pop {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
+
%r = atomicrmw max ptr %ptr, i64 %val seq_cst
ret i64 %r
}
define i64 @test13(ptr %ptr, i64 %val) {
-; CHECK-LABEL: test13:
-; CHECK: dmb {{ish$}}
-; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK-LE: subs {{[^,]+}}, r1, [[REG1]]
-; CHECK-BE: subs {{[^,]+}}, r2, [[REG2]]
-; CHECK-LE: sbcs {{[^,]+}}, r2, [[REG2]]
-; CHECK-BE: sbcs {{[^,]+}}, r1, [[REG1]]
-; CHECK: mov [[CMP:[a-z0-9]+]], #0
-; CHECK: movwlo [[CMP]], #1
-; CHECK: cmp [[CMP]], #0
-; CHECK: movne [[OUT_HI]], [[REG2]]
-; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
-; CHECK: movne [[OUT_LO]], [[REG1]]
-; CHECK: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
-; CHECK: cmp
-; CHECK: bne
-; CHECK: dmb {{ish$}}
-
-; CHECK-THUMB-LABEL: test13:
-; CHECK-THUMB: dmb {{ish$}}
-; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
-; CHECK-THUMB-LE: subs.w {{[^,]+}}, r2, [[REG1]]
-; CHECK-THUMB-BE: subs.w {{[^,]+}}, r3, [[REG2]]
-; CHECK-THUMB-LE: sbcs.w {{[^,]+}}, r3, [[REG2]]
-; CHECK-THUMB-BE: sbcs.w {{[^,]+}}, r2, [[REG1]]
-; CHECK-THUMB: mov.w [[CMP:[a-z0-9]+]], #0
-; CHECK-THUMB: movlo.w [[CMP]], #1
-; CHECK-THUMB: cmp.w [[CMP]], #0
-; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
-; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
-; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
-; CHECK-THUMB: dmb {{ish$}}
-
-; CHECK-M: __atomic_compare_exchange_8
+; CHECK-LE-LABEL: test13:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: LBB12_1: @ %atomicrmw.start
+; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-LE-NEXT: mov r7, r2
+; CHECK-LE-NEXT: subs r3, r1, r4
+; CHECK-LE-NEXT: sbcs r3, r2, r5
+; CHECK-LE-NEXT: mov r3, #0
+; CHECK-LE-NEXT: movwlo r3, #1
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: movne r7, r5
+; CHECK-LE-NEXT: mov r6, r1
+; CHECK-LE-NEXT: movne r6, r4
+; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: cmp r3, #0
+; CHECK-LE-NEXT: bne LBB12_1
+; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-LE-NEXT: mov r0, r4
+; CHECK-LE-NEXT: mov r1, r5
+; CHECK-LE-NEXT: dmb ish
+; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-LE-LABEL: test13:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: mov r12, r0
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: .LBB12_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-LE-NEXT: mov r4, r2
+; CHECK-THUMB-LE-NEXT: subs.w lr, r2, r0
+; CHECK-THUMB-LE-NEXT: sbcs.w lr, r3, r1
+; CHECK-THUMB-LE-NEXT: mov.w lr, #0
+; CHECK-THUMB-LE-NEXT: it lo
+; CHECK-THUMB-LE-NEXT: movlo.w lr, #1
+; CHECK-THUMB-LE-NEXT: cmp.w lr, #0
+; CHECK-THUMB-LE-NEXT: mov lr, r3
+; CHECK-THUMB-LE-NEXT: itt ne
+; CHECK-THUMB-LE-NEXT: movne lr, r1
+; CHECK-THUMB-LE-NEXT: movne r4, r0
+; CHECK-THUMB-LE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB12_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: dmb ish
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: test13:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: .LBB12_1: @ %atomicrmw.start
+; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
+; CHECK-BE-NEXT: mov r7, r2
+; CHECK-BE-NEXT: subs r3, r2, r5
+; CHECK-BE-NEXT: sbcs r3, r1, r4
+; CHECK-BE-NEXT: mov r3, #0
+; CHECK-BE-NEXT: movwlo r3, #1
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: movne r7, r5
+; CHECK-BE-NEXT: mov r6, r1
+; CHECK-BE-NEXT: movne r6, r4
+; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: bne .LBB12_1
+; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: dmb ish
+; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test13:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: mov r12, r0
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: .LBB12_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-THUMB-BE-NEXT: mov r4, r2
+; CHECK-THUMB-BE-NEXT: subs.w lr, r3, r1
+; CHECK-THUMB-BE-NEXT: sbcs.w lr, r2, r0
+; CHECK-THUMB-BE-NEXT: mov.w lr, #0
+; CHECK-THUMB-BE-NEXT: it lo
+; CHECK-THUMB-BE-NEXT: movlo.w lr, #1
+; CHECK-THUMB-BE-NEXT: cmp.w lr, #0
+; CHECK-THUMB-BE-NEXT: mov lr, r3
+; CHECK-THUMB-BE-NEXT: itt ne
+; CHECK-THUMB-BE-NEXT: movne lr, r1
+; CHECK-THUMB-BE-NEXT: movne r4, r0
+; CHECK-THUMB-BE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB12_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: dmb ish
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-7M-LABEL: test13:
+; CHECK-7M: @ %bb.0:
+; CHECK-7M-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-7M-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-7M-NEXT: .pad #16
+; CHECK-7M-NEXT: sub sp, #16
+; CHECK-7M-NEXT: mov r4, r3
+; CHECK-7M-NEXT: mov r5, r2
+; CHECK-7M-NEXT: ldrd r2, r3, [r0]
+; CHECK-7M-NEXT: add.w r8, sp, #8
+; CHECK-7M-NEXT: mov r6, r0
+; CHECK-7M-NEXT: movs r7, #5
+; CHECK-7M-NEXT: .LBB12_1: @ %atomicrmw.start
+; CHECK-7M-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-7M-NEXT: subs r0, r5, r2
+; CHECK-7M-NEXT: strd r2, r3, [sp, #8]
+; CHECK-7M-NEXT: sbcs.w r0, r4, r3
+; CHECK-7M-NEXT: strd r7, r7, [sp]
+; CHECK-7M-NEXT: mov.w r0, #0
+; CHECK-7M-NEXT: mov r1, r8
+; CHECK-7M-NEXT: it lo
+; CHECK-7M-NEXT: movlo r0, #1
+; CHECK-7M-NEXT: cmp r0, #0
+; CHECK-7M-NEXT: mov r0, r6
+; CHECK-7M-NEXT: itt eq
+; CHECK-7M-NEXT: moveq r2, r5
+; CHECK-7M-NEXT: moveq r3, r4
+; CHECK-7M-NEXT: bl __atomic_compare_exchange_8
+; CHECK-7M-NEXT: ldrd r2, r3, [sp, #8]
+; CHECK-7M-NEXT: cmp r0, #0
+; CHECK-7M-NEXT: beq .LBB12_1
+; CHECK-7M-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-7M-NEXT: mov r0, r2
+; CHECK-7M-NEXT: mov r1, r3
+; CHECK-7M-NEXT: add sp, #16
+; CHECK-7M-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+;
+; CHECK-8M-LABEL: test13:
+; CHECK-8M: @ %bb.0:
+; CHECK-8M-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: push {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: .pad #16
+; CHECK-8M-NEXT: sub sp, sp, #16
+; CHECK-8M-NEXT: add r8, sp, #8
+; CHECK-8M-NEXT: mov r4, r3
+; CHECK-8M-NEXT: mov r5, r2
+; CHECK-8M-NEXT: mov r6, r0
+; CHECK-8M-NEXT: ldm r0, {r2, r3}
+; CHECK-8M-NEXT: mov r7, #5
+; CHECK-8M-NEXT: .LBB12_1: @ %atomicrmw.start
+; CHECK-8M-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-8M-NEXT: subs r0, r5, r2
+; CHECK-8M-NEXT: str r2, [sp, #8]
+; CHECK-8M-NEXT: sbcs r0, r4, r3
+; CHECK-8M-NEXT: str r3, [sp, #12]
+; CHECK-8M-NEXT: mov r0, #0
+; CHECK-8M-NEXT: mov r1, r8
+; CHECK-8M-NEXT: movlo r0, #1
+; CHECK-8M-NEXT: cmp r0, #0
+; CHECK-8M-NEXT: moveq r2, r5
+; CHECK-8M-NEXT: moveq r3, r4
+; CHECK-8M-NEXT: mov r0, r6
+; CHECK-8M-NEXT: str r7, [sp]
+; CHECK-8M-NEXT: str r7, [sp, #4]
+; CHECK-8M-NEXT: bl __atomic_compare_exchange_8
+; CHECK-8M-NEXT: ldr r2, [sp, #8]
+; CHECK-8M-NEXT: cmp r0, #0
+; CHECK-8M-NEXT: ldr r3, [sp, #12]
+; CHECK-8M-NEXT: beq .LBB12_1
+; CHECK-8M-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-8M-NEXT: mov r0, r2
+; CHECK-8M-NEXT: mov r1, r3
+; CHECK-8M-NEXT: add sp, sp, #16
+; CHECK-8M-NEXT: pop {r4, r5, r6, r7, r8, lr}
+; CHECK-8M-NEXT: mov pc, lr
+
+
%r = atomicrmw umax ptr %ptr, i64 %val seq_cst
ret i64 %r
diff --git a/llvm/test/CodeGen/ARM/atomic-ops-v8.ll b/llvm/test/CodeGen/ARM/atomic-ops-v8.ll
index 0a467c2b70acf2..c7d4cf5912d179 100644
--- a/llvm/test/CodeGen/ARM/atomic-ops-v8.ll
+++ b/llvm/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=armv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE --check-prefix=CHECK-ARM --check-prefix=CHECK-ARM-LE
; RUN: llc -mtriple=armebv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE --check-prefix=CHECK-ARM --check-prefix=CHECK-ARM-BE
; RUN: llc -mtriple=thumbv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-LE
@@ -9,1399 +10,2488 @@
@var64 = global i64 0
define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_add_i8:
+; CHECK-ARM-LABEL: test_atomic_load_add_i8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var8
+; CHECK-ARM-NEXT: movt r12, :upper16:var8
+; CHECK-ARM-NEXT: .LBB0_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexb r1, [r12]
+; CHECK-ARM-NEXT: add r3, r1, r0
+; CHECK-ARM-NEXT: stlexb r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB0_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_add_i8:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var8
+; CHECK-THUMB-NEXT: movt r12, :upper16:var8
+; CHECK-THUMB-NEXT: .LBB0_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexb r1, [r12]
+; CHECK-THUMB-NEXT: adds r3, r1, r0
+; CHECK-THUMB-NEXT: stlexb r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB0_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw add ptr @var8, i8 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i8 %old
}
define i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_add_i16:
+; CHECK-ARM-LABEL: test_atomic_load_add_i16:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var16
+; CHECK-ARM-NEXT: movt r12, :upper16:var16
+; CHECK-ARM-NEXT: .LBB1_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexh r1, [r12]
+; CHECK-ARM-NEXT: add r3, r1, r0
+; CHECK-ARM-NEXT: strexh r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB1_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_add_i16:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var16
+; CHECK-THUMB-NEXT: movt r12, :upper16:var16
+; CHECK-THUMB-NEXT: .LBB1_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexh r1, [r12]
+; CHECK-THUMB-NEXT: adds r3, r1, r0
+; CHECK-THUMB-NEXT: strexh r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB1_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw add ptr @var16, i16 %offset acquire
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK: movt r[[ADDR]], :upper16:var16
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i16 %old
}
define i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_add_i32:
+; CHECK-ARM-LABEL: test_atomic_load_add_i32:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var32
+; CHECK-ARM-NEXT: movt r12, :upper16:var32
+; CHECK-ARM-NEXT: .LBB2_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrex r1, [r12]
+; CHECK-ARM-NEXT: add r3, r1, r0
+; CHECK-ARM-NEXT: stlex r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB2_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_add_i32:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var32
+; CHECK-THUMB-NEXT: movt r12, :upper16:var32
+; CHECK-THUMB-NEXT: .LBB2_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrex r1, [r12]
+; CHECK-THUMB-NEXT: adds r3, r1, r0
+; CHECK-THUMB-NEXT: stlex r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB2_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw add ptr @var32, i32 %offset release
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i32 %old
}
define void @test_atomic_load_add_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_add_i64:
+; CHECK-ARM-LE-LABEL: test_atomic_load_add_i64:
+; CHECK-ARM-LE: @ %bb.0:
+; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: .LBB3_1: @ %atomicrmw.start
+; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-LE-NEXT: ldrexd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: adds r6, r4, r0
+; CHECK-ARM-LE-NEXT: adc r7, r5, r1
+; CHECK-ARM-LE-NEXT: strexd r3, r6, r7, [r2]
+; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: bne .LBB3_1
+; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-ARM-BE-LABEL: test_atomic_load_add_i64:
+; CHECK-ARM-BE: @ %bb.0:
+; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: .LBB3_1: @ %atomicrmw.start
+; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-BE-NEXT: ldrexd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: adds r7, r5, r1
+; CHECK-ARM-BE-NEXT: adc r6, r4, r0
+; CHECK-ARM-BE-NEXT: strexd r3, r6, r7, [r2]
+; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: bne .LBB3_1
+; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB-LE-LABEL: test_atomic_load_add_i64:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-LE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-LE-NEXT: .LBB3_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldrexd r3, r2, [r12]
+; CHECK-THUMB-LE-NEXT: adds.w lr, r3, r0
+; CHECK-THUMB-LE-NEXT: adc.w r4, r2, r1
+; CHECK-THUMB-LE-NEXT: strexd r5, lr, r4, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB3_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: strd r3, r2, [r12]
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test_atomic_load_add_i64:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-BE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-BE-NEXT: .LBB3_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldrexd r3, r2, [r12]
+; CHECK-THUMB-BE-NEXT: adds.w lr, r2, r1
+; CHECK-THUMB-BE-NEXT: adc.w r4, r3, r0
+; CHECK-THUMB-BE-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB3_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: strd r3, r2, [r12]
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
%old = atomicrmw add ptr @var64, i64 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-LE-NEXT: adds{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
-; CHECK-LE-NEXT: adc{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
-; CHECK-BE-NEXT: adds{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
-; CHECK-BE-NEXT: adc{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
-; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: strd r[[OLD1]], r[[OLD2]], [r[[ADDR]]]
+
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_sub_i8:
+; CHECK-ARM-LABEL: test_atomic_load_sub_i8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var8
+; CHECK-ARM-NEXT: movt r12, :upper16:var8
+; CHECK-ARM-NEXT: .LBB4_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexb r1, [r12]
+; CHECK-ARM-NEXT: sub r3, r1, r0
+; CHECK-ARM-NEXT: strexb r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB4_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_sub_i8:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var8
+; CHECK-THUMB-NEXT: movt r12, :upper16:var8
+; CHECK-THUMB-NEXT: .LBB4_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexb r1, [r12]
+; CHECK-THUMB-NEXT: subs r3, r1, r0
+; CHECK-THUMB-NEXT: strexb r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB4_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw sub ptr @var8, i8 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i8 %old
}
define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_sub_i16:
+; CHECK-ARM-LABEL: test_atomic_load_sub_i16:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var16
+; CHECK-ARM-NEXT: movt r12, :upper16:var16
+; CHECK-ARM-NEXT: .LBB5_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexh r1, [r12]
+; CHECK-ARM-NEXT: sub r3, r1, r0
+; CHECK-ARM-NEXT: stlexh r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB5_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_sub_i16:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var16
+; CHECK-THUMB-NEXT: movt r12, :upper16:var16
+; CHECK-THUMB-NEXT: .LBB5_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexh r1, [r12]
+; CHECK-THUMB-NEXT: subs r3, r1, r0
+; CHECK-THUMB-NEXT: stlexh r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB5_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw sub ptr @var16, i16 %offset release
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK: movt r[[ADDR]], :upper16:var16
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i16 %old
}
define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_sub_i32:
+; CHECK-ARM-LABEL: test_atomic_load_sub_i32:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var32
+; CHECK-ARM-NEXT: movt r12, :upper16:var32
+; CHECK-ARM-NEXT: .LBB6_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaex r1, [r12]
+; CHECK-ARM-NEXT: sub r3, r1, r0
+; CHECK-ARM-NEXT: strex r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB6_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_sub_i32:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var32
+; CHECK-THUMB-NEXT: movt r12, :upper16:var32
+; CHECK-THUMB-NEXT: .LBB6_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaex r1, [r12]
+; CHECK-THUMB-NEXT: subs r3, r1, r0
+; CHECK-THUMB-NEXT: strex r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB6_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw sub ptr @var32, i32 %offset acquire
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: strex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i32 %old
}
define void @test_atomic_load_sub_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_sub_i64:
+; CHECK-ARM-LE-LABEL: test_atomic_load_sub_i64:
+; CHECK-ARM-LE: @ %bb.0:
+; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: .LBB7_1: @ %atomicrmw.start
+; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-LE-NEXT: ldaexd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: subs r6, r4, r0
+; CHECK-ARM-LE-NEXT: sbc r7, r5, r1
+; CHECK-ARM-LE-NEXT: stlexd r3, r6, r7, [r2]
+; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: bne .LBB7_1
+; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-ARM-BE-LABEL: test_atomic_load_sub_i64:
+; CHECK-ARM-BE: @ %bb.0:
+; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: .LBB7_1: @ %atomicrmw.start
+; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-BE-NEXT: ldaexd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: subs r7, r5, r1
+; CHECK-ARM-BE-NEXT: sbc r6, r4, r0
+; CHECK-ARM-BE-NEXT: stlexd r3, r6, r7, [r2]
+; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: bne .LBB7_1
+; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB-LE-LABEL: test_atomic_load_sub_i64:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-LE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-LE-NEXT: .LBB7_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldaexd r3, r2, [r12]
+; CHECK-THUMB-LE-NEXT: subs.w lr, r3, r0
+; CHECK-THUMB-LE-NEXT: sbc.w r4, r2, r1
+; CHECK-THUMB-LE-NEXT: stlexd r5, lr, r4, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB7_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: strd r3, r2, [r12]
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test_atomic_load_sub_i64:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-BE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-BE-NEXT: .LBB7_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldaexd r3, r2, [r12]
+; CHECK-THUMB-BE-NEXT: subs.w lr, r2, r1
+; CHECK-THUMB-BE-NEXT: sbc.w r4, r3, r0
+; CHECK-THUMB-BE-NEXT: stlexd r5, r4, lr, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB7_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: strd r3, r2, [r12]
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
%old = atomicrmw sub ptr @var64, i64 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-LE-NEXT: subs{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
-; CHECK-LE-NEXT: sbc{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
-; CHECK-BE-NEXT: subs{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
-; CHECK-BE-NEXT: sbc{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
-; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: strd r[[OLD1]], r[[OLD2]], [r[[ADDR]]]
+
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_and_i8:
+; CHECK-ARM-LABEL: test_atomic_load_and_i8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var8
+; CHECK-ARM-NEXT: movt r12, :upper16:var8
+; CHECK-ARM-NEXT: .LBB8_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexb r1, [r12]
+; CHECK-ARM-NEXT: and r3, r1, r0
+; CHECK-ARM-NEXT: stlexb r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB8_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_and_i8:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var8
+; CHECK-THUMB-NEXT: movt r12, :upper16:var8
+; CHECK-THUMB-NEXT: .LBB8_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexb r1, [r12]
+; CHECK-THUMB-NEXT: and.w r3, r1, r0
+; CHECK-THUMB-NEXT: stlexb r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB8_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw and ptr @var8, i8 %offset release
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i8 %old
}
define i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_and_i16:
+; CHECK-ARM-LABEL: test_atomic_load_and_i16:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var16
+; CHECK-ARM-NEXT: movt r12, :upper16:var16
+; CHECK-ARM-NEXT: .LBB9_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexh r1, [r12]
+; CHECK-ARM-NEXT: and r3, r1, r0
+; CHECK-ARM-NEXT: strexh r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB9_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_and_i16:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var16
+; CHECK-THUMB-NEXT: movt r12, :upper16:var16
+; CHECK-THUMB-NEXT: .LBB9_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexh r1, [r12]
+; CHECK-THUMB-NEXT: and.w r3, r1, r0
+; CHECK-THUMB-NEXT: strexh r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB9_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw and ptr @var16, i16 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK: movt r[[ADDR]], :upper16:var16
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i16 %old
}
define i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_and_i32:
+; CHECK-ARM-LABEL: test_atomic_load_and_i32:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var32
+; CHECK-ARM-NEXT: movt r12, :upper16:var32
+; CHECK-ARM-NEXT: .LBB10_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaex r1, [r12]
+; CHECK-ARM-NEXT: and r3, r1, r0
+; CHECK-ARM-NEXT: stlex r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB10_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_and_i32:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var32
+; CHECK-THUMB-NEXT: movt r12, :upper16:var32
+; CHECK-THUMB-NEXT: .LBB10_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaex r1, [r12]
+; CHECK-THUMB-NEXT: and.w r3, r1, r0
+; CHECK-THUMB-NEXT: stlex r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB10_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw and ptr @var32, i32 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i32 %old
}
define void @test_atomic_load_and_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_and_i64:
+; CHECK-ARM-LABEL: test_atomic_load_and_i64:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-NEXT: .LBB11_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexd r4, r5, [r2]
+; CHECK-ARM-NEXT: and r7, r5, r1
+; CHECK-ARM-NEXT: and r6, r4, r0
+; CHECK-ARM-NEXT: strexd r3, r6, r7, [r2]
+; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: bne .LBB11_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB-LABEL: test_atomic_load_and_i64:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-NEXT: .LBB11_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexd r3, r2, [r12]
+; CHECK-THUMB-NEXT: and.w lr, r2, r1
+; CHECK-THUMB-NEXT: and.w r4, r3, r0
+; CHECK-THUMB-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-NEXT: cmp r5, #0
+; CHECK-THUMB-NEXT: bne .LBB11_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: strd r3, r2, [r12]
+; CHECK-THUMB-NEXT: pop {r4, r5, r7, pc}
%old = atomicrmw and ptr @var64, i64 %offset acquire
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-LE-DAG: and{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
-; CHECK-LE-DAG: and{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
-; CHECK-BE-DAG: and{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
-; CHECK-BE-DAG: and{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
-; CHECK: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: strd r[[OLD1]], r[[OLD2]], [r[[ADDR]]]
+
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_or_i8:
+; CHECK-ARM-LABEL: test_atomic_load_or_i8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var8
+; CHECK-ARM-NEXT: movt r12, :upper16:var8
+; CHECK-ARM-NEXT: .LBB12_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexb r1, [r12]
+; CHECK-ARM-NEXT: orr r3, r1, r0
+; CHECK-ARM-NEXT: stlexb r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB12_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_or_i8:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var8
+; CHECK-THUMB-NEXT: movt r12, :upper16:var8
+; CHECK-THUMB-NEXT: .LBB12_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexb r1, [r12]
+; CHECK-THUMB-NEXT: orr.w r3, r1, r0
+; CHECK-THUMB-NEXT: stlexb r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB12_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw or ptr @var8, i8 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i8 %old
}
define i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_or_i16:
+; CHECK-ARM-LABEL: test_atomic_load_or_i16:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var16
+; CHECK-ARM-NEXT: movt r12, :upper16:var16
+; CHECK-ARM-NEXT: .LBB13_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexh r1, [r12]
+; CHECK-ARM-NEXT: orr r3, r1, r0
+; CHECK-ARM-NEXT: strexh r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB13_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_or_i16:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var16
+; CHECK-THUMB-NEXT: movt r12, :upper16:var16
+; CHECK-THUMB-NEXT: .LBB13_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexh r1, [r12]
+; CHECK-THUMB-NEXT: orr.w r3, r1, r0
+; CHECK-THUMB-NEXT: strexh r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB13_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw or ptr @var16, i16 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK: movt r[[ADDR]], :upper16:var16
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i16 %old
}
define i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_or_i32:
+; CHECK-ARM-LABEL: test_atomic_load_or_i32:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var32
+; CHECK-ARM-NEXT: movt r12, :upper16:var32
+; CHECK-ARM-NEXT: .LBB14_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaex r1, [r12]
+; CHECK-ARM-NEXT: orr r3, r1, r0
+; CHECK-ARM-NEXT: strex r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB14_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_or_i32:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var32
+; CHECK-THUMB-NEXT: movt r12, :upper16:var32
+; CHECK-THUMB-NEXT: .LBB14_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaex r1, [r12]
+; CHECK-THUMB-NEXT: orr.w r3, r1, r0
+; CHECK-THUMB-NEXT: strex r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB14_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw or ptr @var32, i32 %offset acquire
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: strex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i32 %old
}
define void @test_atomic_load_or_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_or_i64:
+; CHECK-ARM-LABEL: test_atomic_load_or_i64:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-NEXT: .LBB15_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexd r4, r5, [r2]
+; CHECK-ARM-NEXT: orr r7, r5, r1
+; CHECK-ARM-NEXT: orr r6, r4, r0
+; CHECK-ARM-NEXT: stlexd r3, r6, r7, [r2]
+; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: bne .LBB15_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB-LABEL: test_atomic_load_or_i64:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-NEXT: .LBB15_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexd r3, r2, [r12]
+; CHECK-THUMB-NEXT: orr.w lr, r2, r1
+; CHECK-THUMB-NEXT: orr.w r4, r3, r0
+; CHECK-THUMB-NEXT: stlexd r5, r4, lr, [r12]
+; CHECK-THUMB-NEXT: cmp r5, #0
+; CHECK-THUMB-NEXT: bne .LBB15_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: strd r3, r2, [r12]
+; CHECK-THUMB-NEXT: pop {r4, r5, r7, pc}
%old = atomicrmw or ptr @var64, i64 %offset release
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-LE-DAG: orr{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
-; CHECK-LE-DAG: orr{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
-; CHECK-BE-DAG: orr{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
-; CHECK-BE-DAG: orr{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
-; CHECK: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: strd r[[OLD1]], r[[OLD2]], [r[[ADDR]]]
+
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xor_i8:
+; CHECK-ARM-LABEL: test_atomic_load_xor_i8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var8
+; CHECK-ARM-NEXT: movt r12, :upper16:var8
+; CHECK-ARM-NEXT: .LBB16_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexb r1, [r12]
+; CHECK-ARM-NEXT: eor r3, r1, r0
+; CHECK-ARM-NEXT: strexb r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB16_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_xor_i8:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var8
+; CHECK-THUMB-NEXT: movt r12, :upper16:var8
+; CHECK-THUMB-NEXT: .LBB16_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexb r1, [r12]
+; CHECK-THUMB-NEXT: eor.w r3, r1, r0
+; CHECK-THUMB-NEXT: strexb r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB16_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw xor ptr @var8, i8 %offset acquire
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i8 %old
}
define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xor_i16:
+; CHECK-ARM-LABEL: test_atomic_load_xor_i16:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var16
+; CHECK-ARM-NEXT: movt r12, :upper16:var16
+; CHECK-ARM-NEXT: .LBB17_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexh r1, [r12]
+; CHECK-ARM-NEXT: eor r3, r1, r0
+; CHECK-ARM-NEXT: stlexh r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB17_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_xor_i16:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var16
+; CHECK-THUMB-NEXT: movt r12, :upper16:var16
+; CHECK-THUMB-NEXT: .LBB17_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexh r1, [r12]
+; CHECK-THUMB-NEXT: eor.w r3, r1, r0
+; CHECK-THUMB-NEXT: stlexh r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB17_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw xor ptr @var16, i16 %offset release
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK: movt r[[ADDR]], :upper16:var16
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i16 %old
}
define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xor_i32:
+; CHECK-ARM-LABEL: test_atomic_load_xor_i32:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var32
+; CHECK-ARM-NEXT: movt r12, :upper16:var32
+; CHECK-ARM-NEXT: .LBB18_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaex r1, [r12]
+; CHECK-ARM-NEXT: eor r3, r1, r0
+; CHECK-ARM-NEXT: stlex r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB18_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_xor_i32:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var32
+; CHECK-THUMB-NEXT: movt r12, :upper16:var32
+; CHECK-THUMB-NEXT: .LBB18_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaex r1, [r12]
+; CHECK-THUMB-NEXT: eor.w r3, r1, r0
+; CHECK-THUMB-NEXT: stlex r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB18_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw xor ptr @var32, i32 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
-; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i32 %old
}
define void @test_atomic_load_xor_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xor_i64:
+; CHECK-ARM-LABEL: test_atomic_load_xor_i64:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-NEXT: .LBB19_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexd r4, r5, [r2]
+; CHECK-ARM-NEXT: eor r7, r5, r1
+; CHECK-ARM-NEXT: eor r6, r4, r0
+; CHECK-ARM-NEXT: strexd r3, r6, r7, [r2]
+; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: bne .LBB19_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB-LABEL: test_atomic_load_xor_i64:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-NEXT: .LBB19_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexd r3, r2, [r12]
+; CHECK-THUMB-NEXT: eor.w lr, r2, r1
+; CHECK-THUMB-NEXT: eor.w r4, r3, r0
+; CHECK-THUMB-NEXT: strexd r5, r4, lr, [r12]
+; CHECK-THUMB-NEXT: cmp r5, #0
+; CHECK-THUMB-NEXT: bne .LBB19_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: strd r3, r2, [r12]
+; CHECK-THUMB-NEXT: pop {r4, r5, r7, pc}
%old = atomicrmw xor ptr @var64, i64 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-LE-DAG: eor{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
-; CHECK-LE-DAG: eor{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
-; CHECK-BE-DAG: eor{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
-; CHECK-BE-DAG: eor{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
-; CHECK: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: strd r[[OLD1]], r[[OLD2]], [r[[ADDR]]]
+
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_xchg_i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r2, :lower16:var8
+; CHECK-NEXT: movt r2, :upper16:var8
+; CHECK-NEXT: .LBB20_1: @ %atomicrmw.start
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldrexb r1, [r2]
+; CHECK-NEXT: strexb r3, r0, [r2]
+; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: bne .LBB20_1
+; CHECK-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
%old = atomicrmw xchg ptr @var8, i8 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD]]
ret i8 %old
}
define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_xchg_i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r2, :lower16:var16
+; CHECK-NEXT: movt r2, :upper16:var16
+; CHECK-NEXT: .LBB21_1: @ %atomicrmw.start
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldaexh r1, [r2]
+; CHECK-NEXT: stlexh r3, r0, [r2]
+; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: bne .LBB21_1
+; CHECK-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
%old = atomicrmw xchg ptr @var16, i16 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK: movt r[[ADDR]], :upper16:var16
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD]]
ret i16 %old
}
define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_xchg_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r2, :lower16:var32
+; CHECK-NEXT: movt r2, :upper16:var32
+; CHECK-NEXT: .LBB22_1: @ %atomicrmw.start
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldrex r1, [r2]
+; CHECK-NEXT: stlex r3, r0, [r2]
+; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: bne .LBB22_1
+; CHECK-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
%old = atomicrmw xchg ptr @var32, i32 %offset release
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD]]
ret i32 %old
}
define void @test_atomic_load_xchg_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xchg_i64:
+; CHECK-ARM-LABEL: test_atomic_load_xchg_i64:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: push {r4, r5, r11, lr}
+; CHECK-ARM-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1
+; CHECK-ARM-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1
+; CHECK-ARM-NEXT: .LBB23_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexd r4, r5, [r2]
+; CHECK-ARM-NEXT: strexd r3, r0, r1, [r2]
+; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: bne .LBB23_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-NEXT: pop {r4, r5, r11, pc}
+;
+; CHECK-THUMB-LABEL: test_atomic_load_xchg_i64:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: push {r7, lr}
+; CHECK-THUMB-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-NEXT: .LBB23_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexd r3, lr, [r12]
+; CHECK-THUMB-NEXT: strexd r2, r0, r1, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB23_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: strd r3, lr, [r12]
+; CHECK-THUMB-NEXT: pop {r7, pc}
%old = atomicrmw xchg ptr @var64, i64 %offset acquire
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_load_min_i8(i8 signext %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_min_i8:
+; CHECK-ARM-LABEL: test_atomic_load_min_i8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var8
+; CHECK-ARM-NEXT: movt r12, :upper16:var8
+; CHECK-ARM-NEXT: .LBB24_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexb r1, [r12]
+; CHECK-ARM-NEXT: sxtb r3, r1
+; CHECK-ARM-NEXT: cmp r3, r0
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: movle r3, r1
+; CHECK-ARM-NEXT: strexb r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB24_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_min_i8:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var8
+; CHECK-THUMB-NEXT: movt r12, :upper16:var8
+; CHECK-THUMB-NEXT: .LBB24_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexb r1, [r12]
+; CHECK-THUMB-NEXT: sxtb r3, r1
+; CHECK-THUMB-NEXT: cmp r3, r0
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: it le
+; CHECK-THUMB-NEXT: movle r3, r1
+; CHECK-THUMB-NEXT: strexb r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB24_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw min ptr @var8, i8 %offset acquire
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK-DAG: movw [[ADDR:r[0-9]+|lr]], :lower16:var8
-; CHECK-DAG: movt [[ADDR]], :upper16:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexb r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
-; CHECK-NEXT: sxtb r[[OLDX:[0-9]+]], r[[OLD]]
+
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp r[[OLDX]], r0
; Thumb mode: it le
-; CHECK: movle r[[OLDX]], r[[OLD]]
-; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r[[OLDX]], {{.*}}[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i8 %old
}
define i16 @test_atomic_load_min_i16(i16 signext %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_min_i16:
+; CHECK-ARM-LABEL: test_atomic_load_min_i16:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var16
+; CHECK-ARM-NEXT: movt r12, :upper16:var16
+; CHECK-ARM-NEXT: .LBB25_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexh r1, [r12]
+; CHECK-ARM-NEXT: sxth r3, r1
+; CHECK-ARM-NEXT: cmp r3, r0
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: movle r3, r1
+; CHECK-ARM-NEXT: stlexh r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB25_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_min_i16:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var16
+; CHECK-THUMB-NEXT: movt r12, :upper16:var16
+; CHECK-THUMB-NEXT: .LBB25_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexh r1, [r12]
+; CHECK-THUMB-NEXT: sxth r3, r1
+; CHECK-THUMB-NEXT: cmp r3, r0
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: it le
+; CHECK-THUMB-NEXT: movle r3, r1
+; CHECK-THUMB-NEXT: stlexh r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB25_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw min ptr @var16, i16 %offset release
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var16
-; CHECK: movt [[ADDR]], :upper16:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexh r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
-; CHECK-NEXT: sxth r[[OLDX:[0-9]+]], r[[OLD]]
+
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp r[[OLDX]], r0
; Thumb mode: it le
-; CHECK: movle r[[OLDX]], r[[OLD]]
-; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r[[OLDX]], {{.*}}[[ADDR]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i16 %old
}
define i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_min_i32:
+; CHECK-ARM-LABEL: test_atomic_load_min_i32:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var32
+; CHECK-ARM-NEXT: movt r12, :upper16:var32
+; CHECK-ARM-NEXT: .LBB26_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrex r1, [r12]
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: cmp r1, r0
+; CHECK-ARM-NEXT: movle r3, r1
+; CHECK-ARM-NEXT: strex r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB26_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_min_i32:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var32
+; CHECK-THUMB-NEXT: movt r12, :upper16:var32
+; CHECK-THUMB-NEXT: .LBB26_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrex r1, [r12]
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: cmp r1, r0
+; CHECK-THUMB-NEXT: it le
+; CHECK-THUMB-NEXT: movle r3, r1
+; CHECK-THUMB-NEXT: strex r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB26_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw min ptr @var32, i32 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
-; CHECK-NEXT: cmp r[[OLD]], r0
; Thumb mode: it le
-; CHECK: movle r[[NEW]], r[[OLD]]
-; CHECK-NEXT: strex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i32 %old
}
define void @test_atomic_load_min_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_min_i64:
+; CHECK-ARM-LE-LABEL: test_atomic_load_min_i64:
+; CHECK-ARM-LE: @ %bb.0:
+; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: .LBB27_1: @ %atomicrmw.start
+; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-LE-NEXT: ldaexd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: mov r7, r1
+; CHECK-ARM-LE-NEXT: subs r3, r0, r4
+; CHECK-ARM-LE-NEXT: sbcs r3, r1, r5
+; CHECK-ARM-LE-NEXT: mov r3, #0
+; CHECK-ARM-LE-NEXT: movwge r3, #1
+; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: movne r7, r5
+; CHECK-ARM-LE-NEXT: mov r6, r0
+; CHECK-ARM-LE-NEXT: movne r6, r4
+; CHECK-ARM-LE-NEXT: stlexd r3, r6, r7, [r2]
+; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: bne .LBB27_1
+; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-ARM-BE-LABEL: test_atomic_load_min_i64:
+; CHECK-ARM-BE: @ %bb.0:
+; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: .LBB27_1: @ %atomicrmw.start
+; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-BE-NEXT: ldaexd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: mov r7, r1
+; CHECK-ARM-BE-NEXT: subs r3, r1, r5
+; CHECK-ARM-BE-NEXT: sbcs r3, r0, r4
+; CHECK-ARM-BE-NEXT: mov r3, #0
+; CHECK-ARM-BE-NEXT: movwge r3, #1
+; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: movne r7, r5
+; CHECK-ARM-BE-NEXT: mov r6, r0
+; CHECK-ARM-BE-NEXT: movne r6, r4
+; CHECK-ARM-BE-NEXT: stlexd r3, r6, r7, [r2]
+; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: bne .LBB27_1
+; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB-LE-LABEL: test_atomic_load_min_i64:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-LE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-LE-NEXT: .LBB27_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldaexd r3, lr, [r12]
+; CHECK-THUMB-LE-NEXT: mov r4, r0
+; CHECK-THUMB-LE-NEXT: subs r2, r0, r3
+; CHECK-THUMB-LE-NEXT: sbcs.w r2, r1, lr
+; CHECK-THUMB-LE-NEXT: mov.w r2, #0
+; CHECK-THUMB-LE-NEXT: it ge
+; CHECK-THUMB-LE-NEXT: movge r2, #1
+; CHECK-THUMB-LE-NEXT: cmp r2, #0
+; CHECK-THUMB-LE-NEXT: mov r2, r1
+; CHECK-THUMB-LE-NEXT: itt ne
+; CHECK-THUMB-LE-NEXT: movne r2, lr
+; CHECK-THUMB-LE-NEXT: movne r4, r3
+; CHECK-THUMB-LE-NEXT: stlexd r5, r4, r2, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB27_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: strd r3, lr, [r12]
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test_atomic_load_min_i64:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-BE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-BE-NEXT: .LBB27_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldaexd r3, lr, [r12]
+; CHECK-THUMB-BE-NEXT: mov r4, r0
+; CHECK-THUMB-BE-NEXT: subs.w r2, r1, lr
+; CHECK-THUMB-BE-NEXT: sbcs.w r2, r0, r3
+; CHECK-THUMB-BE-NEXT: mov.w r2, #0
+; CHECK-THUMB-BE-NEXT: it ge
+; CHECK-THUMB-BE-NEXT: movge r2, #1
+; CHECK-THUMB-BE-NEXT: cmp r2, #0
+; CHECK-THUMB-BE-NEXT: mov r2, r1
+; CHECK-THUMB-BE-NEXT: itt ne
+; CHECK-THUMB-BE-NEXT: movne r2, lr
+; CHECK-THUMB-BE-NEXT: movne r4, r3
+; CHECK-THUMB-BE-NEXT: stlexd r5, r4, r2, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB27_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: strd r3, lr, [r12]
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
%old = atomicrmw min ptr @var64, i64 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
-; CHECK-ARM-LE: subs {{[^,]+}}, r0, [[OLD1]]
-; CHECK-ARM-LE: sbcs {{[^,]+}}, r1, [[OLD2]]
-; CHECK-ARM-BE: subs {{[^,]+}}, r1, [[OLD2]]
-; CHECK-ARM-BE: sbcs {{[^,]+}}, r0, [[OLD1]]
-; CHECK-ARM: mov [[CMP:r[0-9]+|lr]], #0
-; CHECK-ARM: movwge [[CMP:r[0-9]+|lr]], #1
-; CHECK-ARM: cmp [[CMP:r[0-9]+|lr]], #0
-; CHECK-ARM: movne [[MINHI]], [[OLD2]]
-; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
-; CHECK-ARM: movne [[MINLO]], [[OLD1]]
-; CHECK-ARM: stlexd [[STATUS:r[0-9]+]], [[MINLO]], [[MINHI]], [r[[ADDR]]]
-; CHECK-THUMB: stlexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK-ARM: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_load_max_i8(i8 signext %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_max_i8:
+; CHECK-ARM-LABEL: test_atomic_load_max_i8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var8
+; CHECK-ARM-NEXT: movt r12, :upper16:var8
+; CHECK-ARM-NEXT: .LBB28_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexb r1, [r12]
+; CHECK-ARM-NEXT: sxtb r3, r1
+; CHECK-ARM-NEXT: cmp r3, r0
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: movgt r3, r1
+; CHECK-ARM-NEXT: stlexb r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB28_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_max_i8:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var8
+; CHECK-THUMB-NEXT: movt r12, :upper16:var8
+; CHECK-THUMB-NEXT: .LBB28_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexb r1, [r12]
+; CHECK-THUMB-NEXT: sxtb r3, r1
+; CHECK-THUMB-NEXT: cmp r3, r0
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: it gt
+; CHECK-THUMB-NEXT: movgt r3, r1
+; CHECK-THUMB-NEXT: stlexb r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB28_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw max ptr @var8, i8 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var8
-; CHECK: movt [[ADDR]], :upper16:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexb r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
-; CHECK-NEXT: sxtb r[[OLDX:[0-9]+]], r[[OLD]]
+
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp r[[OLDX]], r0
; Thumb mode: it gt
-; CHECK: movgt r[[OLDX]], r[[OLD]]
-; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], r[[OLDX]], {{.*}}[[ADDR]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i8 %old
}
define i16 @test_atomic_load_max_i16(i16 signext %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_max_i16:
+; CHECK-ARM-LABEL: test_atomic_load_max_i16:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var16
+; CHECK-ARM-NEXT: movt r12, :upper16:var16
+; CHECK-ARM-NEXT: .LBB29_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexh r1, [r12]
+; CHECK-ARM-NEXT: sxth r3, r1
+; CHECK-ARM-NEXT: cmp r3, r0
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: movgt r3, r1
+; CHECK-ARM-NEXT: strexh r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB29_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_max_i16:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var16
+; CHECK-THUMB-NEXT: movt r12, :upper16:var16
+; CHECK-THUMB-NEXT: .LBB29_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexh r1, [r12]
+; CHECK-THUMB-NEXT: sxth r3, r1
+; CHECK-THUMB-NEXT: cmp r3, r0
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: it gt
+; CHECK-THUMB-NEXT: movgt r3, r1
+; CHECK-THUMB-NEXT: strexh r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB29_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw max ptr @var16, i16 %offset acquire
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK: movt r[[ADDR]], :upper16:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
-; CHECK-NEXT: sxth r[[OLDX:[0-9]+]], r[[OLD]]
+
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp r[[OLDX]], r0
; Thumb mode: it gt
-; CHECK: movgt r[[OLDX]], r[[OLD]]
-; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i16 %old
}
define i32 @test_atomic_load_max_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_max_i32:
+; CHECK-ARM-LABEL: test_atomic_load_max_i32:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var32
+; CHECK-ARM-NEXT: movt r12, :upper16:var32
+; CHECK-ARM-NEXT: .LBB30_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrex r1, [r12]
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: cmp r1, r0
+; CHECK-ARM-NEXT: movgt r3, r1
+; CHECK-ARM-NEXT: stlex r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB30_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_max_i32:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var32
+; CHECK-THUMB-NEXT: movt r12, :upper16:var32
+; CHECK-THUMB-NEXT: .LBB30_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrex r1, [r12]
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: cmp r1, r0
+; CHECK-THUMB-NEXT: it gt
+; CHECK-THUMB-NEXT: movgt r3, r1
+; CHECK-THUMB-NEXT: stlex r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB30_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw max ptr @var32, i32 %offset release
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
-; CHECK-NEXT: cmp r[[OLD]], r0
; Thumb mode: it gt
-; CHECK: movgt r[[NEW]], r[[OLD]]
-; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i32 %old
}
define void @test_atomic_load_max_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_max_i64:
+; CHECK-ARM-LE-LABEL: test_atomic_load_max_i64:
+; CHECK-ARM-LE: @ %bb.0:
+; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: .LBB31_1: @ %atomicrmw.start
+; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-LE-NEXT: ldrexd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: mov r7, r1
+; CHECK-ARM-LE-NEXT: subs r3, r0, r4
+; CHECK-ARM-LE-NEXT: sbcs r3, r1, r5
+; CHECK-ARM-LE-NEXT: mov r3, #0
+; CHECK-ARM-LE-NEXT: movwlt r3, #1
+; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: movne r7, r5
+; CHECK-ARM-LE-NEXT: mov r6, r0
+; CHECK-ARM-LE-NEXT: movne r6, r4
+; CHECK-ARM-LE-NEXT: strexd r3, r6, r7, [r2]
+; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: bne .LBB31_1
+; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-ARM-BE-LABEL: test_atomic_load_max_i64:
+; CHECK-ARM-BE: @ %bb.0:
+; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: .LBB31_1: @ %atomicrmw.start
+; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-BE-NEXT: ldrexd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: mov r7, r1
+; CHECK-ARM-BE-NEXT: subs r3, r1, r5
+; CHECK-ARM-BE-NEXT: sbcs r3, r0, r4
+; CHECK-ARM-BE-NEXT: mov r3, #0
+; CHECK-ARM-BE-NEXT: movwlt r3, #1
+; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: movne r7, r5
+; CHECK-ARM-BE-NEXT: mov r6, r0
+; CHECK-ARM-BE-NEXT: movne r6, r4
+; CHECK-ARM-BE-NEXT: strexd r3, r6, r7, [r2]
+; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: bne .LBB31_1
+; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB-LE-LABEL: test_atomic_load_max_i64:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-LE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-LE-NEXT: .LBB31_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldrexd r3, lr, [r12]
+; CHECK-THUMB-LE-NEXT: mov r4, r0
+; CHECK-THUMB-LE-NEXT: subs r2, r0, r3
+; CHECK-THUMB-LE-NEXT: sbcs.w r2, r1, lr
+; CHECK-THUMB-LE-NEXT: mov.w r2, #0
+; CHECK-THUMB-LE-NEXT: it lt
+; CHECK-THUMB-LE-NEXT: movlt r2, #1
+; CHECK-THUMB-LE-NEXT: cmp r2, #0
+; CHECK-THUMB-LE-NEXT: mov r2, r1
+; CHECK-THUMB-LE-NEXT: itt ne
+; CHECK-THUMB-LE-NEXT: movne r2, lr
+; CHECK-THUMB-LE-NEXT: movne r4, r3
+; CHECK-THUMB-LE-NEXT: strexd r5, r4, r2, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB31_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: strd r3, lr, [r12]
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test_atomic_load_max_i64:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-BE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-BE-NEXT: .LBB31_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldrexd r3, lr, [r12]
+; CHECK-THUMB-BE-NEXT: mov r4, r0
+; CHECK-THUMB-BE-NEXT: subs.w r2, r1, lr
+; CHECK-THUMB-BE-NEXT: sbcs.w r2, r0, r3
+; CHECK-THUMB-BE-NEXT: mov.w r2, #0
+; CHECK-THUMB-BE-NEXT: it lt
+; CHECK-THUMB-BE-NEXT: movlt r2, #1
+; CHECK-THUMB-BE-NEXT: cmp r2, #0
+; CHECK-THUMB-BE-NEXT: mov r2, r1
+; CHECK-THUMB-BE-NEXT: itt ne
+; CHECK-THUMB-BE-NEXT: movne r2, lr
+; CHECK-THUMB-BE-NEXT: movne r4, r3
+; CHECK-THUMB-BE-NEXT: strexd r5, r4, r2, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB31_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: strd r3, lr, [r12]
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
%old = atomicrmw max ptr @var64, i64 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
-; CHECK-ARM-LE: subs {{[^,]+}}, r0, [[OLD1]]
-; CHECK-ARM-LE: sbcs {{[^,]+}}, r1, [[OLD2]]
-; CHECK-ARM-BE: subs {{[^,]+}}, r1, [[OLD2]]
-; CHECK-ARM-BE: sbcs {{[^,]+}}, r0, [[OLD1]]
-; CHECK-ARM: mov [[CMP:r[0-9]+|lr]], #0
-; CHECK-ARM: movwlt [[CMP:r[0-9]+|lr]], #1
-; CHECK-ARM: cmp [[CMP:r[0-9]+|lr]], #0
-; CHECK-ARM: movne [[MINHI]], [[OLD2]]
-; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
-; CHECK-ARM: movne [[MINLO]], [[OLD1]]
-; CHECK-ARM: strexd [[STATUS:r[0-9]+]], [[MINLO]], [[MINHI]], [r[[ADDR]]]
-; CHECK-THUMB: strexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK-ARM: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_load_umin_i8(i8 zeroext %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_umin_i8:
+; CHECK-ARM-LABEL: test_atomic_load_umin_i8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var8
+; CHECK-ARM-NEXT: movt r12, :upper16:var8
+; CHECK-ARM-NEXT: .LBB32_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexb r1, [r12]
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: cmp r1, r0
+; CHECK-ARM-NEXT: movls r3, r1
+; CHECK-ARM-NEXT: strexb r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB32_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_umin_i8:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var8
+; CHECK-THUMB-NEXT: movt r12, :upper16:var8
+; CHECK-THUMB-NEXT: .LBB32_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexb r1, [r12]
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: cmp r1, r0
+; CHECK-THUMB-NEXT: it ls
+; CHECK-THUMB-NEXT: movls r3, r1
+; CHECK-THUMB-NEXT: strexb r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB32_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw umin ptr @var8, i8 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var8
-; CHECK: movt [[ADDR]], :upper16:var8
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexb r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
-; CHECK-NEXT: cmp r[[OLD]], r0
; Thumb mode: it ls
-; CHECK: movls r[[NEW]], r[[OLD]]
-; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r[[NEW]], {{.*}}[[ADDR]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i8 %old
}
define i16 @test_atomic_load_umin_i16(i16 zeroext %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_umin_i16:
+; CHECK-ARM-LABEL: test_atomic_load_umin_i16:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var16
+; CHECK-ARM-NEXT: movt r12, :upper16:var16
+; CHECK-ARM-NEXT: .LBB33_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexh r1, [r12]
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: cmp r1, r0
+; CHECK-ARM-NEXT: movls r3, r1
+; CHECK-ARM-NEXT: strexh r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB33_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_umin_i16:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var16
+; CHECK-THUMB-NEXT: movt r12, :upper16:var16
+; CHECK-THUMB-NEXT: .LBB33_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexh r1, [r12]
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: cmp r1, r0
+; CHECK-THUMB-NEXT: it ls
+; CHECK-THUMB-NEXT: movls r3, r1
+; CHECK-THUMB-NEXT: strexh r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB33_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw umin ptr @var16, i16 %offset acquire
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var16
-; CHECK: movt [[ADDR]], :upper16:var16
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexh r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
-; CHECK-NEXT: cmp r[[OLD]], r0
; Thumb mode: it ls
-; CHECK: movls r[[NEW]], r[[OLD]]
-; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[NEW]], {{.*}}[[ADDR]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i16 %old
}
define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_umin_i32:
+; CHECK-ARM-LABEL: test_atomic_load_umin_i32:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var32
+; CHECK-ARM-NEXT: movt r12, :upper16:var32
+; CHECK-ARM-NEXT: .LBB34_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaex r1, [r12]
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: cmp r1, r0
+; CHECK-ARM-NEXT: movls r3, r1
+; CHECK-ARM-NEXT: stlex r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB34_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_umin_i32:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var32
+; CHECK-THUMB-NEXT: movt r12, :upper16:var32
+; CHECK-THUMB-NEXT: .LBB34_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaex r1, [r12]
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: cmp r1, r0
+; CHECK-THUMB-NEXT: it ls
+; CHECK-THUMB-NEXT: movls r3, r1
+; CHECK-THUMB-NEXT: stlex r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB34_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw umin ptr @var32, i32 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
-; CHECK-NEXT: cmp r[[OLD]], r0
; Thumb mode: it ls
-; CHECK: movls r[[NEW]], r[[OLD]]
-; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i32 %old
}
define void @test_atomic_load_umin_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_umin_i64:
+; CHECK-ARM-LE-LABEL: test_atomic_load_umin_i64:
+; CHECK-ARM-LE: @ %bb.0:
+; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: .LBB35_1: @ %atomicrmw.start
+; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-LE-NEXT: ldaexd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: mov r7, r1
+; CHECK-ARM-LE-NEXT: subs r3, r0, r4
+; CHECK-ARM-LE-NEXT: sbcs r3, r1, r5
+; CHECK-ARM-LE-NEXT: mov r3, #0
+; CHECK-ARM-LE-NEXT: movwhs r3, #1
+; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: movne r7, r5
+; CHECK-ARM-LE-NEXT: mov r6, r0
+; CHECK-ARM-LE-NEXT: movne r6, r4
+; CHECK-ARM-LE-NEXT: stlexd r3, r6, r7, [r2]
+; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: bne .LBB35_1
+; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-ARM-BE-LABEL: test_atomic_load_umin_i64:
+; CHECK-ARM-BE: @ %bb.0:
+; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: .LBB35_1: @ %atomicrmw.start
+; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-BE-NEXT: ldaexd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: mov r7, r1
+; CHECK-ARM-BE-NEXT: subs r3, r1, r5
+; CHECK-ARM-BE-NEXT: sbcs r3, r0, r4
+; CHECK-ARM-BE-NEXT: mov r3, #0
+; CHECK-ARM-BE-NEXT: movwhs r3, #1
+; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: movne r7, r5
+; CHECK-ARM-BE-NEXT: mov r6, r0
+; CHECK-ARM-BE-NEXT: movne r6, r4
+; CHECK-ARM-BE-NEXT: stlexd r3, r6, r7, [r2]
+; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: bne .LBB35_1
+; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB-LE-LABEL: test_atomic_load_umin_i64:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-LE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-LE-NEXT: .LBB35_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldaexd r3, lr, [r12]
+; CHECK-THUMB-LE-NEXT: mov r4, r0
+; CHECK-THUMB-LE-NEXT: subs r2, r0, r3
+; CHECK-THUMB-LE-NEXT: sbcs.w r2, r1, lr
+; CHECK-THUMB-LE-NEXT: mov.w r2, #0
+; CHECK-THUMB-LE-NEXT: it hs
+; CHECK-THUMB-LE-NEXT: movhs r2, #1
+; CHECK-THUMB-LE-NEXT: cmp r2, #0
+; CHECK-THUMB-LE-NEXT: mov r2, r1
+; CHECK-THUMB-LE-NEXT: itt ne
+; CHECK-THUMB-LE-NEXT: movne r2, lr
+; CHECK-THUMB-LE-NEXT: movne r4, r3
+; CHECK-THUMB-LE-NEXT: stlexd r5, r4, r2, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB35_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: strd r3, lr, [r12]
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test_atomic_load_umin_i64:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-BE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-BE-NEXT: .LBB35_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldaexd r3, lr, [r12]
+; CHECK-THUMB-BE-NEXT: mov r4, r0
+; CHECK-THUMB-BE-NEXT: subs.w r2, r1, lr
+; CHECK-THUMB-BE-NEXT: sbcs.w r2, r0, r3
+; CHECK-THUMB-BE-NEXT: mov.w r2, #0
+; CHECK-THUMB-BE-NEXT: it hs
+; CHECK-THUMB-BE-NEXT: movhs r2, #1
+; CHECK-THUMB-BE-NEXT: cmp r2, #0
+; CHECK-THUMB-BE-NEXT: mov r2, r1
+; CHECK-THUMB-BE-NEXT: itt ne
+; CHECK-THUMB-BE-NEXT: movne r2, lr
+; CHECK-THUMB-BE-NEXT: movne r4, r3
+; CHECK-THUMB-BE-NEXT: stlexd r5, r4, r2, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB35_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: strd r3, lr, [r12]
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
%old = atomicrmw umin ptr @var64, i64 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
-; CHECK-ARM-LE: subs {{[^,]+}}, r0, [[OLD1]]
-; CHECK-ARM-LE: sbcs {{[^,]+}}, r1, [[OLD2]]
-; CHECK-ARM-BE: subs {{[^,]+}}, r1, [[OLD2]]
-; CHECK-ARM-BE: sbcs {{[^,]+}}, r0, [[OLD1]]
-; CHECK-ARM: mov [[CMP:r[0-9]+|lr]], #0
-; CHECK-ARM: movwhs [[CMP:r[0-9]+|lr]], #1
-; CHECK-ARM: cmp [[CMP:r[0-9]+|lr]], #0
-; CHECK-ARM: movne [[MINHI]], [[OLD2]]
-; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
-; CHECK-ARM: movne [[MINLO]], [[OLD1]]
-; CHECK-ARM: stlexd [[STATUS:r[0-9]+]], [[MINLO]], [[MINHI]], [r[[ADDR]]]
-; CHECK-THUMB: stlexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK-ARM: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_load_umax_i8(i8 zeroext %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_umax_i8:
+; CHECK-ARM-LABEL: test_atomic_load_umax_i8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var8
+; CHECK-ARM-NEXT: movt r12, :upper16:var8
+; CHECK-ARM-NEXT: .LBB36_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexb r1, [r12]
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: cmp r1, r0
+; CHECK-ARM-NEXT: movhi r3, r1
+; CHECK-ARM-NEXT: stlexb r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB36_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_umax_i8:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var8
+; CHECK-THUMB-NEXT: movt r12, :upper16:var8
+; CHECK-THUMB-NEXT: .LBB36_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexb r1, [r12]
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: cmp r1, r0
+; CHECK-THUMB-NEXT: it hi
+; CHECK-THUMB-NEXT: movhi r3, r1
+; CHECK-THUMB-NEXT: stlexb r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB36_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw umax ptr @var8, i8 %offset acq_rel
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var8
-; CHECK: movt [[ADDR]], :upper16:var8
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexb r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
-; CHECK-NEXT: cmp r[[OLD]], r0
; Thumb mode: it hi
-; CHECK: movhi r[[NEW]], r[[OLD]]
-; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], r[[NEW]], {{.*}}[[ADDR]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i8 %old
}
define i16 @test_atomic_load_umax_i16(i16 zeroext %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_umax_i16:
+; CHECK-ARM-LABEL: test_atomic_load_umax_i16:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var16
+; CHECK-ARM-NEXT: movt r12, :upper16:var16
+; CHECK-ARM-NEXT: .LBB37_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexh r1, [r12]
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: cmp r1, r0
+; CHECK-ARM-NEXT: movhi r3, r1
+; CHECK-ARM-NEXT: strexh r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB37_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_umax_i16:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var16
+; CHECK-THUMB-NEXT: movt r12, :upper16:var16
+; CHECK-THUMB-NEXT: .LBB37_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexh r1, [r12]
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: cmp r1, r0
+; CHECK-THUMB-NEXT: it hi
+; CHECK-THUMB-NEXT: movhi r3, r1
+; CHECK-THUMB-NEXT: strexh r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB37_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw umax ptr @var16, i16 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var16
-; CHECK: movt [[ADDR]], :upper16:var16
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexh r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
-; CHECK-NEXT: cmp r[[OLD]], r0
; Thumb mode: it hi
-; CHECK: movhi r[[NEW]], r[[OLD]]
-; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[NEW]], {{.*}}[[ADDR]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i16 %old
}
define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_umax_i32:
+; CHECK-ARM-LABEL: test_atomic_load_umax_i32:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var32
+; CHECK-ARM-NEXT: movt r12, :upper16:var32
+; CHECK-ARM-NEXT: .LBB38_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaex r1, [r12]
+; CHECK-ARM-NEXT: mov r3, r0
+; CHECK-ARM-NEXT: cmp r1, r0
+; CHECK-ARM-NEXT: movhi r3, r1
+; CHECK-ARM-NEXT: stlex r2, r3, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB38_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_load_umax_i32:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var32
+; CHECK-THUMB-NEXT: movt r12, :upper16:var32
+; CHECK-THUMB-NEXT: .LBB38_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaex r1, [r12]
+; CHECK-THUMB-NEXT: mov r3, r0
+; CHECK-THUMB-NEXT: cmp r1, r0
+; CHECK-THUMB-NEXT: it hi
+; CHECK-THUMB-NEXT: movhi r3, r1
+; CHECK-THUMB-NEXT: stlex r2, r3, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB38_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: mov r0, r1
+; CHECK-THUMB-NEXT: bx lr
%old = atomicrmw umax ptr @var32, i32 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
-; CHECK-NEXT: cmp r[[OLD]], r0
; Thumb mode: it hi
-; CHECK: movhi r[[NEW]], r[[OLD]]
-; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: mov r0, r[[OLD]]
+
ret i32 %old
}
define void @test_atomic_load_umax_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_umax_i64:
+; CHECK-ARM-LE-LABEL: test_atomic_load_umax_i64:
+; CHECK-ARM-LE: @ %bb.0:
+; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: .LBB39_1: @ %atomicrmw.start
+; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-LE-NEXT: ldaexd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: mov r7, r1
+; CHECK-ARM-LE-NEXT: subs r3, r0, r4
+; CHECK-ARM-LE-NEXT: sbcs r3, r1, r5
+; CHECK-ARM-LE-NEXT: mov r3, #0
+; CHECK-ARM-LE-NEXT: movwlo r3, #1
+; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: movne r7, r5
+; CHECK-ARM-LE-NEXT: mov r6, r0
+; CHECK-ARM-LE-NEXT: movne r6, r4
+; CHECK-ARM-LE-NEXT: stlexd r3, r6, r7, [r2]
+; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: bne .LBB39_1
+; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-ARM-BE-LABEL: test_atomic_load_umax_i64:
+; CHECK-ARM-BE: @ %bb.0:
+; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: .LBB39_1: @ %atomicrmw.start
+; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-BE-NEXT: ldaexd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: mov r7, r1
+; CHECK-ARM-BE-NEXT: subs r3, r1, r5
+; CHECK-ARM-BE-NEXT: sbcs r3, r0, r4
+; CHECK-ARM-BE-NEXT: mov r3, #0
+; CHECK-ARM-BE-NEXT: movwlo r3, #1
+; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: movne r7, r5
+; CHECK-ARM-BE-NEXT: mov r6, r0
+; CHECK-ARM-BE-NEXT: movne r6, r4
+; CHECK-ARM-BE-NEXT: stlexd r3, r6, r7, [r2]
+; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: bne .LBB39_1
+; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB-LE-LABEL: test_atomic_load_umax_i64:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-LE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-LE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-LE-NEXT: .LBB39_1: @ %atomicrmw.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldaexd r3, lr, [r12]
+; CHECK-THUMB-LE-NEXT: mov r4, r0
+; CHECK-THUMB-LE-NEXT: subs r2, r0, r3
+; CHECK-THUMB-LE-NEXT: sbcs.w r2, r1, lr
+; CHECK-THUMB-LE-NEXT: mov.w r2, #0
+; CHECK-THUMB-LE-NEXT: it lo
+; CHECK-THUMB-LE-NEXT: movlo r2, #1
+; CHECK-THUMB-LE-NEXT: cmp r2, #0
+; CHECK-THUMB-LE-NEXT: mov r2, r1
+; CHECK-THUMB-LE-NEXT: itt ne
+; CHECK-THUMB-LE-NEXT: movne r2, lr
+; CHECK-THUMB-LE-NEXT: movne r4, r3
+; CHECK-THUMB-LE-NEXT: stlexd r5, r4, r2, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB39_1
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-LE-NEXT: strd r3, lr, [r12]
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-THUMB-BE-LABEL: test_atomic_load_umax_i64:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-THUMB-BE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-BE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-BE-NEXT: .LBB39_1: @ %atomicrmw.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldaexd r3, lr, [r12]
+; CHECK-THUMB-BE-NEXT: mov r4, r0
+; CHECK-THUMB-BE-NEXT: subs.w r2, r1, lr
+; CHECK-THUMB-BE-NEXT: sbcs.w r2, r0, r3
+; CHECK-THUMB-BE-NEXT: mov.w r2, #0
+; CHECK-THUMB-BE-NEXT: it lo
+; CHECK-THUMB-BE-NEXT: movlo r2, #1
+; CHECK-THUMB-BE-NEXT: cmp r2, #0
+; CHECK-THUMB-BE-NEXT: mov r2, r1
+; CHECK-THUMB-BE-NEXT: itt ne
+; CHECK-THUMB-BE-NEXT: movne r2, lr
+; CHECK-THUMB-BE-NEXT: movne r4, r3
+; CHECK-THUMB-BE-NEXT: stlexd r5, r4, r2, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB39_1
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-BE-NEXT: strd r3, lr, [r12]
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r7, pc}
%old = atomicrmw umax ptr @var64, i64 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
-; CHECK-ARM-LE: subs {{[^,]+}}, r0, [[OLD1]]
-; CHECK-ARM-LE: sbcs {{[^,]+}}, r1, [[OLD2]]
-; CHECK-ARM-BE: subs {{[^,]+}}, r1, [[OLD2]]
-; CHECK-ARM-BE: sbcs {{[^,]+}}, r0, [[OLD1]]
-; CHECK-ARM: mov [[CMP:r[0-9]+|lr]], #0
-; CHECK-ARM: movwlo [[CMP:r[0-9]+|lr]], #1
-; CHECK-ARM: cmp [[CMP:r[0-9]+|lr]], #0
-; CHECK-ARM: movne [[MINHI]], [[OLD2]]
-; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
-; CHECK-ARM: movne [[MINLO]], [[OLD1]]
-; CHECK-ARM: stlexd [[STATUS:r[0-9]+]], [[MINLO]], [[MINHI]], [r[[ADDR]]]
-; CHECK-THUMB: stlexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK-ARM: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_cmpxchg_i8(i8 zeroext %wanted, i8 zeroext %new) nounwind {
-; CHECK-LABEL: test_atomic_cmpxchg_i8:
+; CHECK-ARM-LABEL: test_atomic_cmpxchg_i8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var8
+; CHECK-ARM-NEXT: movt r12, :upper16:var8
+; CHECK-ARM-NEXT: .LBB40_1: @ %cmpxchg.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexb r2, [r12]
+; CHECK-ARM-NEXT: cmp r2, r0
+; CHECK-ARM-NEXT: bne .LBB40_4
+; CHECK-ARM-NEXT: @ %bb.2: @ %cmpxchg.trystore
+; CHECK-ARM-NEXT: @ in Loop: Header=BB40_1 Depth=1
+; CHECK-ARM-NEXT: strexb r3, r1, [r12]
+; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: bne .LBB40_1
+; CHECK-ARM-NEXT: @ %bb.3: @ %cmpxchg.end
+; CHECK-ARM-NEXT: mov r0, r2
+; CHECK-ARM-NEXT: bx lr
+; CHECK-ARM-NEXT: .LBB40_4: @ %cmpxchg.nostore
+; CHECK-ARM-NEXT: clrex
+; CHECK-ARM-NEXT: mov r0, r2
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_cmpxchg_i8:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var8
+; CHECK-THUMB-NEXT: mov r2, r0
+; CHECK-THUMB-NEXT: movt r12, :upper16:var8
+; CHECK-THUMB-NEXT: .LBB40_1: @ %cmpxchg.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexb r0, [r12]
+; CHECK-THUMB-NEXT: cmp r0, r2
+; CHECK-THUMB-NEXT: bne .LBB40_3
+; CHECK-THUMB-NEXT: @ %bb.2: @ %cmpxchg.trystore
+; CHECK-THUMB-NEXT: @ in Loop: Header=BB40_1 Depth=1
+; CHECK-THUMB-NEXT: strexb r3, r1, [r12]
+; CHECK-THUMB-NEXT: cmp r3, #0
+; CHECK-THUMB-NEXT: it eq
+; CHECK-THUMB-NEXT: bxeq lr
+; CHECK-THUMB-NEXT: b .LBB40_1
+; CHECK-THUMB-NEXT: .LBB40_3: @ %cmpxchg.nostore
+; CHECK-THUMB-NEXT: clrex
+; CHECK-THUMB-NEXT: bx lr
%pair = cmpxchg ptr @var8, i8 %wanted, i8 %new acquire acquire
%old = extractvalue { i8, i1 } %pair, 0
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK-DAG: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK-DAG: movt r[[ADDR]], :upper16:var8
-; CHECK-THUMB-DAG: mov r[[WANTED:[0-9]+]], r0
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-ARM-NEXT: cmp r[[OLD]], r0
-; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_{{[0-9]}}
-; CHECK-NEXT: %bb.2:
; As above, r1 is a reasonable guess.
-; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-ARM-NEXT: bne .LBB{{[0-9]+}}_{{[0-9]}}
-; CHECK-THUMB-NEXT: it eq
-; CHECK-THUMB-NEXT: bxeq lr
-; CHECK-ARM: mov r0, r[[OLD]]
-; CHECK-ARM: clrex
-; CHECK: bx lr
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret i8 %old
}
define i16 @test_atomic_cmpxchg_i16(i16 zeroext %wanted, i16 zeroext %new) nounwind {
-; CHECK-LABEL: test_atomic_cmpxchg_i16:
+; CHECK-ARM-LABEL: test_atomic_cmpxchg_i16:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var16
+; CHECK-ARM-NEXT: movt r12, :upper16:var16
+; CHECK-ARM-NEXT: .LBB41_1: @ %cmpxchg.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldaexh r2, [r12]
+; CHECK-ARM-NEXT: cmp r2, r0
+; CHECK-ARM-NEXT: bne .LBB41_4
+; CHECK-ARM-NEXT: @ %bb.2: @ %cmpxchg.trystore
+; CHECK-ARM-NEXT: @ in Loop: Header=BB41_1 Depth=1
+; CHECK-ARM-NEXT: stlexh r3, r1, [r12]
+; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: bne .LBB41_1
+; CHECK-ARM-NEXT: @ %bb.3: @ %cmpxchg.end
+; CHECK-ARM-NEXT: mov r0, r2
+; CHECK-ARM-NEXT: bx lr
+; CHECK-ARM-NEXT: .LBB41_4: @ %cmpxchg.nostore
+; CHECK-ARM-NEXT: clrex
+; CHECK-ARM-NEXT: mov r0, r2
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_cmpxchg_i16:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var16
+; CHECK-THUMB-NEXT: mov r2, r0
+; CHECK-THUMB-NEXT: movt r12, :upper16:var16
+; CHECK-THUMB-NEXT: .LBB41_1: @ %cmpxchg.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldaexh r0, [r12]
+; CHECK-THUMB-NEXT: cmp r0, r2
+; CHECK-THUMB-NEXT: bne .LBB41_3
+; CHECK-THUMB-NEXT: @ %bb.2: @ %cmpxchg.trystore
+; CHECK-THUMB-NEXT: @ in Loop: Header=BB41_1 Depth=1
+; CHECK-THUMB-NEXT: stlexh r3, r1, [r12]
+; CHECK-THUMB-NEXT: cmp r3, #0
+; CHECK-THUMB-NEXT: it eq
+; CHECK-THUMB-NEXT: bxeq lr
+; CHECK-THUMB-NEXT: b .LBB41_1
+; CHECK-THUMB-NEXT: .LBB41_3: @ %cmpxchg.nostore
+; CHECK-THUMB-NEXT: clrex
+; CHECK-THUMB-NEXT: bx lr
%pair = cmpxchg ptr @var16, i16 %wanted, i16 %new seq_cst seq_cst
%old = extractvalue { i16, i1 } %pair, 0
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK-DAG: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK-DAG: movt r[[ADDR]], :upper16:var16
-; CHECK-THUMB-DAG: mov r[[WANTED:[0-9]+]], r0
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-ARM-NEXT: cmp r[[OLD]], r0
-; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_{{[0-9]}}
-; CHECK-NEXT: %bb.2:
; As above, r1 is a reasonable guess.
-; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-ARM-NEXT: bne .LBB{{[0-9]+}}_{{[0-9]}}
-; CHECK-THUMB-NEXT: it eq
-; CHECK-THUMB-NEXT: bxeq lr
-; CHECK-ARM: mov r0, r[[OLD]]
-; CHECK: bx lr
-; CHECK-ARM-NEXT: .LBB{{[0-9]+}}_{{[0-9]}}
-; CHECK-ARM-NEXT: clrex
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK-ARM: mov r0, r[[OLD]]
-; CHECK-ARM-NEXT: bx lr
+
ret i16 %old
}
define void @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
-; CHECK-LABEL: test_atomic_cmpxchg_i32:
+; CHECK-ARM-LABEL: test_atomic_cmpxchg_i32:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r12, :lower16:var32
+; CHECK-ARM-NEXT: movt r12, :upper16:var32
+; CHECK-ARM-NEXT: .LBB42_1: @ %cmpxchg.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrex r3, [r12]
+; CHECK-ARM-NEXT: cmp r3, r0
+; CHECK-ARM-NEXT: bne .LBB42_4
+; CHECK-ARM-NEXT: @ %bb.2: @ %cmpxchg.trystore
+; CHECK-ARM-NEXT: @ in Loop: Header=BB42_1 Depth=1
+; CHECK-ARM-NEXT: stlex r2, r1, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
+; CHECK-ARM-NEXT: bne .LBB42_1
+; CHECK-ARM-NEXT: @ %bb.3: @ %cmpxchg.end
+; CHECK-ARM-NEXT: str r3, [r12]
+; CHECK-ARM-NEXT: bx lr
+; CHECK-ARM-NEXT: .LBB42_4: @ %cmpxchg.nostore
+; CHECK-ARM-NEXT: clrex
+; CHECK-ARM-NEXT: str r3, [r12]
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: test_atomic_cmpxchg_i32:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var32
+; CHECK-THUMB-NEXT: movt r12, :upper16:var32
+; CHECK-THUMB-NEXT: .LBB42_1: @ %cmpxchg.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrex r3, [r12]
+; CHECK-THUMB-NEXT: cmp r3, r0
+; CHECK-THUMB-NEXT: bne .LBB42_4
+; CHECK-THUMB-NEXT: @ %bb.2: @ %cmpxchg.trystore
+; CHECK-THUMB-NEXT: @ in Loop: Header=BB42_1 Depth=1
+; CHECK-THUMB-NEXT: stlex r2, r1, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB42_1
+; CHECK-THUMB-NEXT: @ %bb.3: @ %cmpxchg.end
+; CHECK-THUMB-NEXT: str.w r3, [r12]
+; CHECK-THUMB-NEXT: bx lr
+; CHECK-THUMB-NEXT: .LBB42_4: @ %cmpxchg.nostore
+; CHECK-THUMB-NEXT: clrex
+; CHECK-THUMB-NEXT: str.w r3, [r12]
+; CHECK-THUMB-NEXT: bx lr
%pair = cmpxchg ptr @var32, i32 %wanted, i32 %new release monotonic
%old = extractvalue { i32, i1 } %pair, 0
store i32 %old, ptr @var32
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
-; CHECK: movt r[[ADDR]], :upper16:var32
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp r[[OLD]], r0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
-; CHECK-NEXT: %bb.2:
; As above, r1 is a reasonable guess.
-; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK: str{{(.w)?}} r[[OLD]],
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .LBB{{[0-9]+}}_4:
-; CHECK-NEXT: clrex
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK: str{{(.w)?}} r[[OLD]],
-; CHECK-ARM-NEXT: bx lr
+
ret void
}
define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
-; CHECK-LABEL: test_atomic_cmpxchg_i64:
+; CHECK-ARM-LE-LABEL: test_atomic_cmpxchg_i64:
+; CHECK-ARM-LE: @ %bb.0:
+; CHECK-ARM-LE-NEXT: push {r4, r5, r6, lr}
+; CHECK-ARM-LE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-LE-NEXT: @ kill: def $r3 killed $r3 killed $r2_r3 def $r2_r3
+; CHECK-ARM-LE-NEXT: movt r12, :upper16:var64
+; CHECK-ARM-LE-NEXT: @ kill: def $r2 killed $r2 killed $r2_r3 def $r2_r3
+; CHECK-ARM-LE-NEXT: .LBB43_1: @ %cmpxchg.start
+; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-LE-NEXT: ldrexd r4, r5, [r12]
+; CHECK-ARM-LE-NEXT: eor lr, r5, r1
+; CHECK-ARM-LE-NEXT: eor r6, r4, r0
+; CHECK-ARM-LE-NEXT: orrs r6, r6, lr
+; CHECK-ARM-LE-NEXT: bne .LBB43_4
+; CHECK-ARM-LE-NEXT: @ %bb.2: @ %cmpxchg.trystore
+; CHECK-ARM-LE-NEXT: @ in Loop: Header=BB43_1 Depth=1
+; CHECK-ARM-LE-NEXT: strexd r6, r2, r3, [r12]
+; CHECK-ARM-LE-NEXT: cmp r6, #0
+; CHECK-ARM-LE-NEXT: bne .LBB43_1
+; CHECK-ARM-LE-NEXT: @ %bb.3: @ %cmpxchg.end
+; CHECK-ARM-LE-NEXT: strd r4, r5, [r12]
+; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, pc}
+; CHECK-ARM-LE-NEXT: .LBB43_4: @ %cmpxchg.nostore
+; CHECK-ARM-LE-NEXT: clrex
+; CHECK-ARM-LE-NEXT: strd r4, r5, [r12]
+; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, pc}
+;
+; CHECK-ARM-BE-LABEL: test_atomic_cmpxchg_i64:
+; CHECK-ARM-BE: @ %bb.0:
+; CHECK-ARM-BE-NEXT: push {r4, r5, r6, lr}
+; CHECK-ARM-BE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-BE-NEXT: @ kill: def $r3 killed $r3 killed $r2_r3 def $r2_r3
+; CHECK-ARM-BE-NEXT: movt r12, :upper16:var64
+; CHECK-ARM-BE-NEXT: @ kill: def $r2 killed $r2 killed $r2_r3 def $r2_r3
+; CHECK-ARM-BE-NEXT: .LBB43_1: @ %cmpxchg.start
+; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-BE-NEXT: ldrexd r4, r5, [r12]
+; CHECK-ARM-BE-NEXT: eor lr, r4, r0
+; CHECK-ARM-BE-NEXT: eor r6, r5, r1
+; CHECK-ARM-BE-NEXT: orrs r6, r6, lr
+; CHECK-ARM-BE-NEXT: bne .LBB43_4
+; CHECK-ARM-BE-NEXT: @ %bb.2: @ %cmpxchg.trystore
+; CHECK-ARM-BE-NEXT: @ in Loop: Header=BB43_1 Depth=1
+; CHECK-ARM-BE-NEXT: strexd r6, r2, r3, [r12]
+; CHECK-ARM-BE-NEXT: cmp r6, #0
+; CHECK-ARM-BE-NEXT: bne .LBB43_1
+; CHECK-ARM-BE-NEXT: @ %bb.3: @ %cmpxchg.end
+; CHECK-ARM-BE-NEXT: strd r4, r5, [r12]
+; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, pc}
+; CHECK-ARM-BE-NEXT: .LBB43_4: @ %cmpxchg.nostore
+; CHECK-ARM-BE-NEXT: clrex
+; CHECK-ARM-BE-NEXT: strd r4, r5, [r12]
+; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, pc}
+;
+; CHECK-THUMB-LE-LABEL: test_atomic_cmpxchg_i64:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: push {r4, r5, r6, lr}
+; CHECK-THUMB-LE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-LE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-LE-NEXT: .LBB43_1: @ %cmpxchg.start
+; CHECK-THUMB-LE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-LE-NEXT: ldrexd lr, r4, [r12]
+; CHECK-THUMB-LE-NEXT: eor.w r5, r4, r1
+; CHECK-THUMB-LE-NEXT: eor.w r6, lr, r0
+; CHECK-THUMB-LE-NEXT: orrs r5, r6
+; CHECK-THUMB-LE-NEXT: bne .LBB43_4
+; CHECK-THUMB-LE-NEXT: @ %bb.2: @ %cmpxchg.trystore
+; CHECK-THUMB-LE-NEXT: @ in Loop: Header=BB43_1 Depth=1
+; CHECK-THUMB-LE-NEXT: strexd r5, r2, r3, [r12]
+; CHECK-THUMB-LE-NEXT: cmp r5, #0
+; CHECK-THUMB-LE-NEXT: bne .LBB43_1
+; CHECK-THUMB-LE-NEXT: @ %bb.3: @ %cmpxchg.end
+; CHECK-THUMB-LE-NEXT: strd lr, r4, [r12]
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r6, pc}
+; CHECK-THUMB-LE-NEXT: .LBB43_4: @ %cmpxchg.nostore
+; CHECK-THUMB-LE-NEXT: clrex
+; CHECK-THUMB-LE-NEXT: strd lr, r4, [r12]
+; CHECK-THUMB-LE-NEXT: pop {r4, r5, r6, pc}
+;
+; CHECK-THUMB-BE-LABEL: test_atomic_cmpxchg_i64:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: push {r4, r5, r6, lr}
+; CHECK-THUMB-BE-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-BE-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-BE-NEXT: .LBB43_1: @ %cmpxchg.start
+; CHECK-THUMB-BE-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-BE-NEXT: ldrexd lr, r4, [r12]
+; CHECK-THUMB-BE-NEXT: eor.w r5, lr, r0
+; CHECK-THUMB-BE-NEXT: eor.w r6, r4, r1
+; CHECK-THUMB-BE-NEXT: orrs r5, r6
+; CHECK-THUMB-BE-NEXT: bne .LBB43_4
+; CHECK-THUMB-BE-NEXT: @ %bb.2: @ %cmpxchg.trystore
+; CHECK-THUMB-BE-NEXT: @ in Loop: Header=BB43_1 Depth=1
+; CHECK-THUMB-BE-NEXT: strexd r5, r2, r3, [r12]
+; CHECK-THUMB-BE-NEXT: cmp r5, #0
+; CHECK-THUMB-BE-NEXT: bne .LBB43_1
+; CHECK-THUMB-BE-NEXT: @ %bb.3: @ %cmpxchg.end
+; CHECK-THUMB-BE-NEXT: strd lr, r4, [r12]
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r6, pc}
+; CHECK-THUMB-BE-NEXT: .LBB43_4: @ %cmpxchg.nostore
+; CHECK-THUMB-BE-NEXT: clrex
+; CHECK-THUMB-BE-NEXT: strd lr, r4, [r12]
+; CHECK-THUMB-BE-NEXT: pop {r4, r5, r6, pc}
%pair = cmpxchg ptr @var64, i64 %wanted, i64 %new monotonic monotonic
%old = extractvalue { i64, i1 } %pair, 0
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldrexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-LE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0
-; CHECK-LE-DAG: eor{{(\.w)?}} [[MISMATCH_HI:r[0-9]+|lr]], [[OLD2]], r1
-; CHECK-ARM-LE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_LO]], [[MISMATCH_HI]]
-; CHECK-THUMB-LE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_HI]], [[MISMATCH_LO]]
-; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_HI:r[0-9]+|lr]], [[OLD2]], r1
-; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0
-; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]]
-; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
-; CHECK-NEXT: %bb.2:
; As above, r2, r3 is a reasonable guess.
-; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
-; CHECK-NEXT: pop
-; CHECK-NEXT: .LBB{{[0-9]+}}_4:
-; CHECK-NEXT: clrex
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-
-; CHECK-ARM: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+
store i64 %old, ptr @var64
ret void
}
define i8 @test_atomic_load_monotonic_i8() nounwind {
; CHECK-LABEL: test_atomic_load_monotonic_i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r0, :lower16:var8
+; CHECK-NEXT: movt r0, :upper16:var8
+; CHECK-NEXT: ldrb r0, [r0]
+; CHECK-NEXT: bx lr
%val = load atomic i8, ptr @var8 monotonic, align 1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK: ldrb r0, [r[[ADDR]]]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret i8 %val
}
define i8 @test_atomic_load_monotonic_regoff_i8(i64 %base, i64 %off) nounwind {
-; CHECK-LABEL: test_atomic_load_monotonic_regoff_i8:
+; CHECK-LE-LABEL: test_atomic_load_monotonic_regoff_i8:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: ldrb r0, [r0, r2]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: test_atomic_load_monotonic_regoff_i8:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: ldrb r0, [r1, r3]
+; CHECK-BE-NEXT: bx lr
%addr_int = add i64 %base, %off
%addr = inttoptr i64 %addr_int to ptr
%val = load atomic i8, ptr %addr monotonic, align 1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK-LE: ldrb r0, [r0, r2]
-; CHECK-BE: ldrb r0, [r1, r3]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret i8 %val
}
define i8 @test_atomic_load_acquire_i8() nounwind {
; CHECK-LABEL: test_atomic_load_acquire_i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r0, :lower16:var8
+; CHECK-NEXT: movt r0, :upper16:var8
+; CHECK-NEXT: ldab r0, [r0]
+; CHECK-NEXT: bx lr
%val = load atomic i8, ptr @var8 acquire, align 1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: ldab r0, [r[[ADDR]]]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret i8 %val
}
define i8 @test_atomic_load_seq_cst_i8() nounwind {
; CHECK-LABEL: test_atomic_load_seq_cst_i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r0, :lower16:var8
+; CHECK-NEXT: movt r0, :upper16:var8
+; CHECK-NEXT: ldab r0, [r0]
+; CHECK-NEXT: bx lr
%val = load atomic i8, ptr @var8 seq_cst, align 1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: ldab r0, [r[[ADDR]]]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret i8 %val
}
define i16 @test_atomic_load_monotonic_i16() nounwind {
; CHECK-LABEL: test_atomic_load_monotonic_i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r0, :lower16:var16
+; CHECK-NEXT: movt r0, :upper16:var16
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: bx lr
%val = load atomic i16, ptr @var16 monotonic, align 2
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movt r[[ADDR]], :upper16:var16
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: ldrh r0, [r[[ADDR]]]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret i16 %val
}
define i32 @test_atomic_load_monotonic_regoff_i32(i64 %base, i64 %off) nounwind {
-; CHECK-LABEL: test_atomic_load_monotonic_regoff_i32:
+; CHECK-LE-LABEL: test_atomic_load_monotonic_regoff_i32:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: ldr r0, [r0, r2]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: test_atomic_load_monotonic_regoff_i32:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: ldr r0, [r1, r3]
+; CHECK-BE-NEXT: bx lr
%addr_int = add i64 %base, %off
%addr = inttoptr i64 %addr_int to ptr
%val = load atomic i32, ptr %addr monotonic, align 4
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK-LE: ldr r0, [r0, r2]
-; CHECK-BE: ldr r0, [r1, r3]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret i32 %val
}
define i64 @test_atomic_load_seq_cst_i64() nounwind {
; CHECK-LABEL: test_atomic_load_seq_cst_i64:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r0, :lower16:var64
+; CHECK-NEXT: movt r0, :upper16:var64
+; CHECK-NEXT: ldaexd r0, r1, [r0]
+; CHECK-NEXT: clrex
+; CHECK-NEXT: bx lr
%val = load atomic i64, ptr @var64 seq_cst, align 8
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movt r[[ADDR]], :upper16:var64
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: ldaexd r0, r1, [r[[ADDR]]]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret i64 %val
}
define void @test_atomic_store_monotonic_i8(i8 %val) nounwind {
; CHECK-LABEL: test_atomic_store_monotonic_i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r1, :lower16:var8
+; CHECK-NEXT: movt r1, :upper16:var8
+; CHECK-NEXT: strb r0, [r1]
+; CHECK-NEXT: bx lr
store atomic i8 %val, ptr @var8 monotonic, align 1
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK: strb r0, [r[[ADDR]]]
ret void
}
define void @test_atomic_store_monotonic_regoff_i8(i64 %base, i64 %off, i8 %val) nounwind {
-; CHECK-LABEL: test_atomic_store_monotonic_regoff_i8:
+; CHECK-ARM-LE-LABEL: test_atomic_store_monotonic_regoff_i8:
+; CHECK-ARM-LE: @ %bb.0:
+; CHECK-ARM-LE-NEXT: ldrb r1, [sp]
+; CHECK-ARM-LE-NEXT: strb r1, [r0, r2]
+; CHECK-ARM-LE-NEXT: bx lr
+;
+; CHECK-ARM-BE-LABEL: test_atomic_store_monotonic_regoff_i8:
+; CHECK-ARM-BE: @ %bb.0:
+; CHECK-ARM-BE-NEXT: ldrb r0, [sp, #3]
+; CHECK-ARM-BE-NEXT: strb r0, [r1, r3]
+; CHECK-ARM-BE-NEXT: bx lr
+;
+; CHECK-THUMB-LE-LABEL: test_atomic_store_monotonic_regoff_i8:
+; CHECK-THUMB-LE: @ %bb.0:
+; CHECK-THUMB-LE-NEXT: ldrb.w r1, [sp]
+; CHECK-THUMB-LE-NEXT: strb r1, [r0, r2]
+; CHECK-THUMB-LE-NEXT: bx lr
+;
+; CHECK-THUMB-BE-LABEL: test_atomic_store_monotonic_regoff_i8:
+; CHECK-THUMB-BE: @ %bb.0:
+; CHECK-THUMB-BE-NEXT: ldrb.w r0, [sp, #3]
+; CHECK-THUMB-BE-NEXT: strb r0, [r1, r3]
+; CHECK-THUMB-BE-NEXT: bx lr
%addr_int = add i64 %base, %off
%addr = inttoptr i64 %addr_int to ptr
store atomic i8 %val, ptr %addr monotonic, align 1
-; CHECK-LE: ldr{{b?(\.w)?}} [[VAL:r[0-9]+]], [sp]
-; CHECK-LE: strb [[VAL]], [r0, r2]
-; CHECK-BE: ldrb{{(\.w)?}} [[VAL:r[0-9]+]], [sp, #3]
-; CHECK-BE: strb [[VAL]], [r1, r3]
ret void
}
define void @test_atomic_store_release_i8(i8 %val) nounwind {
; CHECK-LABEL: test_atomic_store_release_i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r1, :lower16:var8
+; CHECK-NEXT: movt r1, :upper16:var8
+; CHECK-NEXT: stlb r0, [r1]
+; CHECK-NEXT: bx lr
store atomic i8 %val, ptr @var8 release, align 1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: stlb r0, [r[[ADDR]]]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret void
}
define void @test_atomic_store_seq_cst_i8(i8 %val) nounwind {
; CHECK-LABEL: test_atomic_store_seq_cst_i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r1, :lower16:var8
+; CHECK-NEXT: movt r1, :upper16:var8
+; CHECK-NEXT: stlb r0, [r1]
+; CHECK-NEXT: bx lr
store atomic i8 %val, ptr @var8 seq_cst, align 1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movt r[[ADDR]], :upper16:var8
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: stlb r0, [r[[ADDR]]]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret void
}
define void @test_atomic_store_monotonic_i16(i16 %val) nounwind {
; CHECK-LABEL: test_atomic_store_monotonic_i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r1, :lower16:var16
+; CHECK-NEXT: movt r1, :upper16:var16
+; CHECK-NEXT: strh r0, [r1]
+; CHECK-NEXT: bx lr
store atomic i16 %val, ptr @var16 monotonic, align 2
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movt r[[ADDR]], :upper16:var16
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: strh r0, [r[[ADDR]]]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret void
}
define void @test_atomic_store_monotonic_regoff_i32(i64 %base, i64 %off, i32 %val) nounwind {
-; CHECK-LABEL: test_atomic_store_monotonic_regoff_i32:
+; CHECK-LE-LABEL: test_atomic_store_monotonic_regoff_i32:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: ldr r1, [sp]
+; CHECK-LE-NEXT: str r1, [r0, r2]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: test_atomic_store_monotonic_regoff_i32:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: ldr r0, [sp]
+; CHECK-BE-NEXT: str r0, [r1, r3]
+; CHECK-BE-NEXT: bx lr
%addr_int = add i64 %base, %off
%addr = inttoptr i64 %addr_int to ptr
store atomic i32 %val, ptr %addr monotonic, align 4
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: ldr [[VAL:r[0-9]+]], [sp]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK-LE: str [[VAL]], [r0, r2]
-; CHECK-BE: str [[VAL]], [r1, r3]
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret void
}
define void @test_atomic_store_release_i64(i64 %val) nounwind {
-; CHECK-LABEL: test_atomic_store_release_i64:
+; CHECK-ARM-LABEL: test_atomic_store_release_i64:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: push {r4, r5, r11, lr}
+; CHECK-ARM-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1
+; CHECK-ARM-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1
+; CHECK-ARM-NEXT: .LBB57_1: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrexd r4, r5, [r2]
+; CHECK-ARM-NEXT: stlexd r3, r0, r1, [r2]
+; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: bne .LBB57_1
+; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-ARM-NEXT: pop {r4, r5, r11, pc}
+;
+; CHECK-THUMB-LABEL: test_atomic_store_release_i64:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: movw r12, :lower16:var64
+; CHECK-THUMB-NEXT: movt r12, :upper16:var64
+; CHECK-THUMB-NEXT: .LBB57_1: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrexd r3, r2, [r12]
+; CHECK-THUMB-NEXT: stlexd r2, r0, r1, [r12]
+; CHECK-THUMB-NEXT: cmp r2, #0
+; CHECK-THUMB-NEXT: bne .LBB57_1
+; CHECK-THUMB-NEXT: @ %bb.2: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: bx lr
store atomic i64 %val, ptr @var64 release, align 8
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
-; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var64
-; CHECK: movt [[ADDR]], :upper16:var64
-; CHECK: .LBB{{[0-9]+}}_1:
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK: stlexd [[STATUS:r[0-9]+]], r0, r1, {{.*}}[[ADDR]]
-; CHECK-NEXT: cmp [[STATUS]], #0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-; CHECK-NOT: mcr
ret void
}
-define i32 @not.barriers(ptr %var, i1 %cond) {
-; CHECK-LABEL: not.barriers:
+define i32 @not_barriers(ptr %var, i1 %cond) {
+; CHECK-ARM-LABEL: not_barriers:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: tst r1, #1
+; CHECK-ARM-NEXT: beq .LBB58_4
+; CHECK-ARM-NEXT: @ %bb.1: @ %atomic_ver
+; CHECK-ARM-NEXT: dmb ish
+; CHECK-ARM-NEXT: .LBB58_2: @ %atomicrmw.start
+; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-ARM-NEXT: ldrex r1, [r0]
+; CHECK-ARM-NEXT: sub r2, r1, #1
+; CHECK-ARM-NEXT: strex r3, r2, [r0]
+; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: bne .LBB58_2
+; CHECK-ARM-NEXT: @ %bb.3: @ %atomicrmw.end
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: dmb ish
+; CHECK-ARM-NEXT: bx lr
+; CHECK-ARM-NEXT: .LBB58_4: @ %simple_ver
+; CHECK-ARM-NEXT: ldr r1, [r0]
+; CHECK-ARM-NEXT: sub r1, r1, #1
+; CHECK-ARM-NEXT: str r1, [r0]
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-THUMB-LABEL: not_barriers:
+; CHECK-THUMB: @ %bb.0:
+; CHECK-THUMB-NEXT: mov r2, r0
+; CHECK-THUMB-NEXT: lsls r0, r1, #31
+; CHECK-THUMB-NEXT: itttt eq
+; CHECK-THUMB-NEXT: ldreq r0, [r2]
+; CHECK-THUMB-NEXT: subeq r0, #1
+; CHECK-THUMB-NEXT: streq r0, [r2]
+; CHECK-THUMB-NEXT: bxeq lr
+; CHECK-THUMB-NEXT: .LBB58_1: @ %atomic_ver
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: .LBB58_2: @ %atomicrmw.start
+; CHECK-THUMB-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-THUMB-NEXT: ldrex r0, [r2]
+; CHECK-THUMB-NEXT: subs r1, r0, #1
+; CHECK-THUMB-NEXT: strex r3, r1, [r2]
+; CHECK-THUMB-NEXT: cmp r3, #0
+; CHECK-THUMB-NEXT: bne .LBB58_2
+; CHECK-THUMB-NEXT: @ %bb.3: @ %atomicrmw.end
+; CHECK-THUMB-NEXT: dmb ish
+; CHECK-THUMB-NEXT: bx lr
br i1 %cond, label %atomic_ver, label %simple_ver
simple_ver:
%oldval = load i32, ptr %var
@@ -1413,13 +2503,9 @@ atomic_ver:
%val = atomicrmw add ptr %var, i32 -1 monotonic
fence seq_cst
br label %somewhere
-; CHECK: dmb
-; CHECK: ldrex
-; CHECK: dmb
; The key point here is that the second dmb isn't immediately followed by the
; simple_ver basic block, which LLVM attempted to do when DMB had been marked
; with isBarrier. For now, look for something that looks like "somewhere".
-; CHECK-NEXT: {{mov|bx}}
somewhere:
%combined = phi i32 [ %val, %atomic_ver ], [ %newval, %simple_ver]
ret i32 %combined
>From 2527ffa9a3af68646645a26b8bc9fc7a3c2ba613 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Wed, 14 Aug 2024 15:44:42 +0100
Subject: [PATCH 2/4] [RegAllocFast] Allocate use operands in register class
priority order
---
llvm/lib/CodeGen/RegAllocFast.cpp | 41 ++++++++++++++++++++++++++++++-
1 file changed, 40 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 62f7ed29c8c819..6feefd87575e05 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -270,6 +270,8 @@ class RegAllocFastImpl {
SmallVector<unsigned, 0> UsedInInstr;
SmallVector<unsigned, 8> DefOperandIndexes;
+ SmallVector<unsigned, 8> UseOperandIndexes;
+
// Register masks attached to the current instruction.
SmallVector<const uint32_t *> RegMasks;
@@ -336,6 +338,7 @@ class RegAllocFastImpl {
Register Reg) const;
void findAndSortDefOperandIndexes(const MachineInstr &MI);
+ void findAndSortUseOperandIndexes(const MachineInstr &MI);
void allocateInstruction(MachineInstr &MI);
void handleDebugValue(MachineInstr &MI);
@@ -1368,6 +1371,40 @@ void RegAllocFastImpl::findAndSortDefOperandIndexes(const MachineInstr &MI) {
});
}
+/// Fill \ref UseOperandIndexes with the indices of MI's virtual-register "use"
+/// operands that this allocator must assign, sorted so that operands whose
+/// register class has a higher AllocationPriority come (and are allocated) first.
+void RegAllocFastImpl::findAndSortUseOperandIndexes(const MachineInstr &MI) {
+ UseOperandIndexes.clear();
+ // Keep only register operands that are uses of allocatable virtual registers.
+ for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (MO.isUse() && Reg.isVirtual() && shouldAllocateRegister(Reg))
+ UseOperandIndexes.push_back(I);
+ }
+
+ llvm::sort(UseOperandIndexes, [&](unsigned I0, unsigned I1) {
+ const MachineOperand &MO0 = MI.getOperand(I0);
+ const MachineOperand &MO1 = MI.getOperand(I1);
+ Register Reg0 = MO0.getReg();
+ Register Reg1 = MO1.getReg();
+ const TargetRegisterClass &RC0 = *MRI->getRegClass(Reg0);
+ const TargetRegisterClass &RC1 = *MRI->getRegClass(Reg1);
+
+ // A higher AllocationPriority sorts earlier, i.e. is allocated first.
+ if (RC0.AllocationPriority > RC1.AllocationPriority)
+ return true;
+ if (RC0.AllocationPriority < RC1.AllocationPriority)
+ return false;
+
+ // Equal priority: keep original operand order so the result is deterministic.
+ return I0 < I1;
+ });
+}
+
// Returns true if MO is tied and the operand it's tied to is not Undef (not
// Undef is not the same thing as Def).
static bool isTiedToNotUndef(const MachineOperand &MO) {
@@ -1569,7 +1606,9 @@ void RegAllocFastImpl::allocateInstruction(MachineInstr &MI) {
bool ReArrangedImplicitMOs = true;
while (ReArrangedImplicitMOs) {
ReArrangedImplicitMOs = false;
- for (MachineOperand &MO : MI.operands()) {
+ findAndSortUseOperandIndexes(MI);
+ for (unsigned OpIdx : UseOperandIndexes) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
if (!MO.isReg() || !MO.isUse())
continue;
Register Reg = MO.getReg();
>From 33d7fa3aa210b170b7fd7057055dfe24d1339fc1 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Wed, 14 Aug 2024 15:45:26 +0100
Subject: [PATCH 3/4] [ARM] Fix failure to register-allocate CMP_SWAP_64
pseudo-inst
This test case was failing to compile with a "ran out of registers
during register allocation" error at -O0. This was because CMP_SWAP_64
has 3 operands which must each be an even-odd register pair, and two other
GPR operands. All of the def operands are also early-clobber, so
registers can't be shared between uses and defs. Because the function
has an over-aligned alloca it needs frame and base pointers, so r6 and
r11 are both reserved. That leaves r0/r1, r2/r3, r4/r5 and r8/r9 as the
only valid register pairs, and if the two individual GPR operands happen
to get allocated to registers in different pairs then only 2 pairs will
be available for the three GPRPair operands.
The fix is to allocate the GPRPair operands first, because once they are
allocated they can't prevent the GPR operands from being allocated.
---
llvm/lib/Target/ARM/ARMRegisterInfo.td | 3 +
.../CodeGen/ARM/atomic-64bit-fast-regalloc.ll | 82 ++++
llvm/test/CodeGen/ARM/atomic-64bit.ll | 400 +++++++++---------
llvm/test/CodeGen/ARM/atomic-ops-v8.ll | 364 ++++++++--------
.../CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll | 60 ++-
5 files changed, 496 insertions(+), 413 deletions(-)
create mode 100644 llvm/test/CodeGen/ARM/atomic-64bit-fast-regalloc.ll
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td
index 212f22651f9f94..fc7e591ff44c89 100644
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -548,6 +548,9 @@ def Tuples2Rsp : RegisterTuples<[gsub_0, gsub_1],
// Register class representing a pair of even-odd GPRs.
def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp, Tuples2Rsp)> {
let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
+ // GPRPair must be allocated before GPR so that CMP_SWAP_64 can always be
+ // allocated when both the frame pointer and base pointer are reserved.
+ let AllocationPriority = 1;
}
// Register class representing a pair of even-odd GPRs, except (R12, SP).
diff --git a/llvm/test/CodeGen/ARM/atomic-64bit-fast-regalloc.ll b/llvm/test/CodeGen/ARM/atomic-64bit-fast-regalloc.ll
new file mode 100644
index 00000000000000..e06064a1bf9f32
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/atomic-64bit-fast-regalloc.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -O0 | FileCheck %s
+
+;; Previously, this failed during register allocation because the CMP_SWAP_64
+;; pseudo-instruction has a lot of operands, many of which need to be even-odd
+;; register pairs, and the over-aligned alloca in this function causes both a
+;; frame pointer and a base pointer to be needed.
+
+define void @test(ptr %ptr) {
+; CHECK-LABEL: test:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push {r4, r5, r6, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .setfp r11, sp, #24
+; CHECK-NEXT: add r11, sp, #24
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: bfc sp, #0, #4
+; CHECK-NEXT: mov r6, sp
+; CHECK-NEXT: str r0, [r6, #12] @ 4-byte Spill
+; CHECK-NEXT: b .LBB0_1
+; CHECK-NEXT: .LBB0_1: @ %block1
+; CHECK-NEXT: ldr r0, [r6, #12] @ 4-byte Reload
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: sub r1, r1, #16
+; CHECK-NEXT: bic r1, r1, #15
+; CHECK-NEXT: mov sp, r1
+; CHECK-NEXT: dmb ish
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: ldr r0, [r0, #4]
+; CHECK-NEXT: str r1, [r6, #4] @ 4-byte Spill
+; CHECK-NEXT: str r0, [r6, #8] @ 4-byte Spill
+; CHECK-NEXT: b .LBB0_2
+; CHECK-NEXT: .LBB0_2: @ %atomicrmw.start
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
+; CHECK-NEXT: ldr r1, [r6, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r8, [r6, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r12, [r6, #12] @ 4-byte Reload
+; CHECK-NEXT: str r8, [r6] @ 4-byte Spill
+; CHECK-NEXT: @ kill: def $r8 killed $r8 def $r8_r9
+; CHECK-NEXT: mov r9, r1
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: mov r3, r0
+; CHECK-NEXT: .LBB0_3: @ %atomicrmw.start
+; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldrexd r4, r5, [r12]
+; CHECK-NEXT: cmp r4, r8
+; CHECK-NEXT: cmpeq r5, r9
+; CHECK-NEXT: bne .LBB0_5
+; CHECK-NEXT: @ %bb.4: @ %atomicrmw.start
+; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=2
+; CHECK-NEXT: strexd r0, r2, r3, [r12]
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: bne .LBB0_3
+; CHECK-NEXT: .LBB0_5: @ %atomicrmw.start
+; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: ldr r2, [r6] @ 4-byte Reload
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: eor r3, r0, r1
+; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: eor r2, r1, r2
+; CHECK-NEXT: orr r2, r2, r3
+; CHECK-NEXT: cmp r2, #0
+; CHECK-NEXT: str r1, [r6, #4] @ 4-byte Spill
+; CHECK-NEXT: str r0, [r6, #8] @ 4-byte Spill
+; CHECK-NEXT: bne .LBB0_2
+; CHECK-NEXT: b .LBB0_6
+; CHECK-NEXT: .LBB0_6: @ %atomicrmw.end
+; CHECK-NEXT: dmb ish
+; CHECK-NEXT: sub sp, r11, #24
+; CHECK-NEXT: pop {r4, r5, r6, r8, r9, r10, r11, pc}
+entry:
+ br label %block1
+
+block1:
+ %stuff = alloca i8, i64 16, align 16
+ store atomic i64 0, ptr %ptr seq_cst, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/ARM/atomic-64bit.ll b/llvm/test/CodeGen/ARM/atomic-64bit.ll
index 80658bdb9fda0a..84f1559fee14af 100644
--- a/llvm/test/CodeGen/ARM/atomic-64bit.ll
+++ b/llvm/test/CodeGen/ARM/atomic-64bit.ll
@@ -9,21 +9,21 @@
define i64 @test1(ptr %ptr, i64 %val) {
; CHECK-LE-LABEL: test1:
; CHECK-LE: @ %bb.0:
-; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r9, r1
+; CHECK-LE-NEXT: mov r12, r0
; CHECK-LE-NEXT: dmb ish
; CHECK-LE-NEXT: LBB0_1: @ %atomicrmw.start
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-LE-NEXT: adds r6, r4, r1
-; CHECK-LE-NEXT: adc r7, r5, r2
-; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-LE-NEXT: adds r4, r0, r9
+; CHECK-LE-NEXT: adc r5, r1, r2
+; CHECK-LE-NEXT: strexd r3, r4, r5, [r12]
; CHECK-LE-NEXT: cmp r3, #0
; CHECK-LE-NEXT: bne LBB0_1
; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-LE-NEXT: mov r0, r4
-; CHECK-LE-NEXT: mov r1, r5
; CHECK-LE-NEXT: dmb ish
-; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-LE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-LE-LABEL: test1:
; CHECK-THUMB-LE: @ %bb.0:
@@ -45,21 +45,21 @@ define i64 @test1(ptr %ptr, i64 %val) {
;
; CHECK-BE-LABEL: test1:
; CHECK-BE: @ %bb.0:
-; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r12, r1
+; CHECK-BE-NEXT: mov lr, r0
; CHECK-BE-NEXT: dmb ish
; CHECK-BE-NEXT: .LBB0_1: @ %atomicrmw.start
; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-BE-NEXT: adds r7, r5, r2
-; CHECK-BE-NEXT: adc r6, r4, r1
-; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: ldrexd r0, r1, [lr]
+; CHECK-BE-NEXT: adds r5, r1, r2
+; CHECK-BE-NEXT: adc r4, r0, r12
+; CHECK-BE-NEXT: strexd r3, r4, r5, [lr]
; CHECK-BE-NEXT: cmp r3, #0
; CHECK-BE-NEXT: bne .LBB0_1
; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-BE-NEXT: mov r0, r4
-; CHECK-BE-NEXT: mov r1, r5
; CHECK-BE-NEXT: dmb ish
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-BE-LABEL: test1:
; CHECK-THUMB-BE: @ %bb.0:
@@ -113,21 +113,21 @@ define i64 @test1(ptr %ptr, i64 %val) {
define i64 @test2(ptr %ptr, i64 %val) {
; CHECK-LE-LABEL: test2:
; CHECK-LE: @ %bb.0:
-; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r9, r1
+; CHECK-LE-NEXT: mov r12, r0
; CHECK-LE-NEXT: dmb ish
; CHECK-LE-NEXT: LBB1_1: @ %atomicrmw.start
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-LE-NEXT: subs r6, r4, r1
-; CHECK-LE-NEXT: sbc r7, r5, r2
-; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-LE-NEXT: subs r4, r0, r9
+; CHECK-LE-NEXT: sbc r5, r1, r2
+; CHECK-LE-NEXT: strexd r3, r4, r5, [r12]
; CHECK-LE-NEXT: cmp r3, #0
; CHECK-LE-NEXT: bne LBB1_1
; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-LE-NEXT: mov r0, r4
-; CHECK-LE-NEXT: mov r1, r5
; CHECK-LE-NEXT: dmb ish
-; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-LE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-LE-LABEL: test2:
; CHECK-THUMB-LE: @ %bb.0:
@@ -149,21 +149,21 @@ define i64 @test2(ptr %ptr, i64 %val) {
;
; CHECK-BE-LABEL: test2:
; CHECK-BE: @ %bb.0:
-; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r12, r1
+; CHECK-BE-NEXT: mov lr, r0
; CHECK-BE-NEXT: dmb ish
; CHECK-BE-NEXT: .LBB1_1: @ %atomicrmw.start
; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-BE-NEXT: subs r7, r5, r2
-; CHECK-BE-NEXT: sbc r6, r4, r1
-; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: ldrexd r0, r1, [lr]
+; CHECK-BE-NEXT: subs r5, r1, r2
+; CHECK-BE-NEXT: sbc r4, r0, r12
+; CHECK-BE-NEXT: strexd r3, r4, r5, [lr]
; CHECK-BE-NEXT: cmp r3, #0
; CHECK-BE-NEXT: bne .LBB1_1
; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-BE-NEXT: mov r0, r4
-; CHECK-BE-NEXT: mov r1, r5
; CHECK-BE-NEXT: dmb ish
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-BE-LABEL: test2:
; CHECK-THUMB-BE: @ %bb.0:
@@ -217,21 +217,21 @@ define i64 @test2(ptr %ptr, i64 %val) {
define i64 @test3(ptr %ptr, i64 %val) {
; CHECK-LE-LABEL: test3:
; CHECK-LE: @ %bb.0:
-; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r9, r1
+; CHECK-LE-NEXT: mov r12, r0
; CHECK-LE-NEXT: dmb ish
; CHECK-LE-NEXT: LBB2_1: @ %atomicrmw.start
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-LE-NEXT: and r6, r4, r1
-; CHECK-LE-NEXT: and r7, r5, r2
-; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-LE-NEXT: and r4, r0, r9
+; CHECK-LE-NEXT: and r5, r1, r2
+; CHECK-LE-NEXT: strexd r3, r4, r5, [r12]
; CHECK-LE-NEXT: cmp r3, #0
; CHECK-LE-NEXT: bne LBB2_1
; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-LE-NEXT: mov r0, r4
-; CHECK-LE-NEXT: mov r1, r5
; CHECK-LE-NEXT: dmb ish
-; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-LE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-LABEL: test3:
; CHECK-THUMB: @ %bb.0:
@@ -253,21 +253,21 @@ define i64 @test3(ptr %ptr, i64 %val) {
;
; CHECK-BE-LABEL: test3:
; CHECK-BE: @ %bb.0:
-; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r12, r1
+; CHECK-BE-NEXT: mov lr, r0
; CHECK-BE-NEXT: dmb ish
; CHECK-BE-NEXT: .LBB2_1: @ %atomicrmw.start
; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-BE-NEXT: and r6, r4, r1
-; CHECK-BE-NEXT: and r7, r5, r2
-; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: ldrexd r0, r1, [lr]
+; CHECK-BE-NEXT: and r4, r0, r12
+; CHECK-BE-NEXT: and r5, r1, r2
+; CHECK-BE-NEXT: strexd r3, r4, r5, [lr]
; CHECK-BE-NEXT: cmp r3, #0
; CHECK-BE-NEXT: bne .LBB2_1
; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-BE-NEXT: mov r0, r4
-; CHECK-BE-NEXT: mov r1, r5
; CHECK-BE-NEXT: dmb ish
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, pc}
;
; CHECK-7M-LABEL: test3:
; CHECK-7M: @ %bb.0:
@@ -303,21 +303,21 @@ define i64 @test3(ptr %ptr, i64 %val) {
define i64 @test4(ptr %ptr, i64 %val) {
; CHECK-LE-LABEL: test4:
; CHECK-LE: @ %bb.0:
-; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r9, r1
+; CHECK-LE-NEXT: mov r12, r0
; CHECK-LE-NEXT: dmb ish
; CHECK-LE-NEXT: LBB3_1: @ %atomicrmw.start
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-LE-NEXT: orr r6, r4, r1
-; CHECK-LE-NEXT: orr r7, r5, r2
-; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-LE-NEXT: orr r4, r0, r9
+; CHECK-LE-NEXT: orr r5, r1, r2
+; CHECK-LE-NEXT: strexd r3, r4, r5, [r12]
; CHECK-LE-NEXT: cmp r3, #0
; CHECK-LE-NEXT: bne LBB3_1
; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-LE-NEXT: mov r0, r4
-; CHECK-LE-NEXT: mov r1, r5
; CHECK-LE-NEXT: dmb ish
-; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-LE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-LABEL: test4:
; CHECK-THUMB: @ %bb.0:
@@ -339,21 +339,21 @@ define i64 @test4(ptr %ptr, i64 %val) {
;
; CHECK-BE-LABEL: test4:
; CHECK-BE: @ %bb.0:
-; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r12, r1
+; CHECK-BE-NEXT: mov lr, r0
; CHECK-BE-NEXT: dmb ish
; CHECK-BE-NEXT: .LBB3_1: @ %atomicrmw.start
; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-BE-NEXT: orr r6, r4, r1
-; CHECK-BE-NEXT: orr r7, r5, r2
-; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: ldrexd r0, r1, [lr]
+; CHECK-BE-NEXT: orr r4, r0, r12
+; CHECK-BE-NEXT: orr r5, r1, r2
+; CHECK-BE-NEXT: strexd r3, r4, r5, [lr]
; CHECK-BE-NEXT: cmp r3, #0
; CHECK-BE-NEXT: bne .LBB3_1
; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-BE-NEXT: mov r0, r4
-; CHECK-BE-NEXT: mov r1, r5
; CHECK-BE-NEXT: dmb ish
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, pc}
;
; CHECK-7M-LABEL: test4:
; CHECK-7M: @ %bb.0:
@@ -389,21 +389,21 @@ define i64 @test4(ptr %ptr, i64 %val) {
define i64 @test5(ptr %ptr, i64 %val) {
; CHECK-LE-LABEL: test5:
; CHECK-LE: @ %bb.0:
-; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r9, r1
+; CHECK-LE-NEXT: mov r12, r0
; CHECK-LE-NEXT: dmb ish
; CHECK-LE-NEXT: LBB4_1: @ %atomicrmw.start
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-LE-NEXT: eor r6, r4, r1
-; CHECK-LE-NEXT: eor r7, r5, r2
-; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-LE-NEXT: eor r4, r0, r9
+; CHECK-LE-NEXT: eor r5, r1, r2
+; CHECK-LE-NEXT: strexd r3, r4, r5, [r12]
; CHECK-LE-NEXT: cmp r3, #0
; CHECK-LE-NEXT: bne LBB4_1
; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-LE-NEXT: mov r0, r4
-; CHECK-LE-NEXT: mov r1, r5
; CHECK-LE-NEXT: dmb ish
-; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-LE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-LABEL: test5:
; CHECK-THUMB: @ %bb.0:
@@ -425,21 +425,21 @@ define i64 @test5(ptr %ptr, i64 %val) {
;
; CHECK-BE-LABEL: test5:
; CHECK-BE: @ %bb.0:
-; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r12, r1
+; CHECK-BE-NEXT: mov lr, r0
; CHECK-BE-NEXT: dmb ish
; CHECK-BE-NEXT: .LBB4_1: @ %atomicrmw.start
; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-BE-NEXT: eor r6, r4, r1
-; CHECK-BE-NEXT: eor r7, r5, r2
-; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: ldrexd r0, r1, [lr]
+; CHECK-BE-NEXT: eor r4, r0, r12
+; CHECK-BE-NEXT: eor r5, r1, r2
+; CHECK-BE-NEXT: strexd r3, r4, r5, [lr]
; CHECK-BE-NEXT: cmp r3, #0
; CHECK-BE-NEXT: bne .LBB4_1
; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-BE-NEXT: mov r0, r4
-; CHECK-BE-NEXT: mov r1, r5
; CHECK-BE-NEXT: dmb ish
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, pc}
;
; CHECK-7M-LABEL: test5:
; CHECK-7M: @ %bb.0:
@@ -846,28 +846,28 @@ define void @test9(ptr %ptr, i64 %val) {
define i64 @test10(ptr %ptr, i64 %val) {
; CHECK-LE-LABEL: test10:
; CHECK-LE: @ %bb.0:
-; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r9, r1
+; CHECK-LE-NEXT: mov r12, r0
; CHECK-LE-NEXT: dmb ish
; CHECK-LE-NEXT: LBB9_1: @ %atomicrmw.start
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-LE-NEXT: mov r7, r2
-; CHECK-LE-NEXT: subs r3, r1, r4
-; CHECK-LE-NEXT: sbcs r3, r2, r5
-; CHECK-LE-NEXT: mov r3, #0
-; CHECK-LE-NEXT: movwge r3, #1
-; CHECK-LE-NEXT: cmp r3, #0
-; CHECK-LE-NEXT: movne r7, r5
-; CHECK-LE-NEXT: mov r6, r1
-; CHECK-LE-NEXT: movne r6, r4
-; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-LE-NEXT: subs r5, r9, r0
+; CHECK-LE-NEXT: sbcs r5, r2, r1
+; CHECK-LE-NEXT: mov r5, #0
+; CHECK-LE-NEXT: movwge r5, #1
+; CHECK-LE-NEXT: cmp r5, #0
+; CHECK-LE-NEXT: mov r5, r2
+; CHECK-LE-NEXT: movne r5, r1
+; CHECK-LE-NEXT: mov r4, r9
+; CHECK-LE-NEXT: movne r4, r0
+; CHECK-LE-NEXT: strexd r3, r4, r5, [r12]
; CHECK-LE-NEXT: cmp r3, #0
; CHECK-LE-NEXT: bne LBB9_1
; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-LE-NEXT: mov r0, r4
-; CHECK-LE-NEXT: mov r1, r5
; CHECK-LE-NEXT: dmb ish
-; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-LE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-LE-LABEL: test10:
; CHECK-THUMB-LE: @ %bb.0:
@@ -898,28 +898,28 @@ define i64 @test10(ptr %ptr, i64 %val) {
;
; CHECK-BE-LABEL: test10:
; CHECK-BE: @ %bb.0:
-; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r12, r1
+; CHECK-BE-NEXT: mov lr, r0
; CHECK-BE-NEXT: dmb ish
; CHECK-BE-NEXT: .LBB9_1: @ %atomicrmw.start
; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-BE-NEXT: mov r7, r2
-; CHECK-BE-NEXT: subs r3, r2, r5
-; CHECK-BE-NEXT: sbcs r3, r1, r4
-; CHECK-BE-NEXT: mov r3, #0
-; CHECK-BE-NEXT: movwge r3, #1
-; CHECK-BE-NEXT: cmp r3, #0
-; CHECK-BE-NEXT: movne r7, r5
-; CHECK-BE-NEXT: mov r6, r1
-; CHECK-BE-NEXT: movne r6, r4
-; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: ldrexd r0, r1, [lr]
+; CHECK-BE-NEXT: subs r5, r2, r1
+; CHECK-BE-NEXT: sbcs r5, r12, r0
+; CHECK-BE-NEXT: mov r5, #0
+; CHECK-BE-NEXT: movwge r5, #1
+; CHECK-BE-NEXT: cmp r5, #0
+; CHECK-BE-NEXT: mov r5, r2
+; CHECK-BE-NEXT: movne r5, r1
+; CHECK-BE-NEXT: mov r4, r12
+; CHECK-BE-NEXT: movne r4, r0
+; CHECK-BE-NEXT: strexd r3, r4, r5, [lr]
; CHECK-BE-NEXT: cmp r3, #0
; CHECK-BE-NEXT: bne .LBB9_1
; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-BE-NEXT: mov r0, r4
-; CHECK-BE-NEXT: mov r1, r5
; CHECK-BE-NEXT: dmb ish
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-BE-LABEL: test10:
; CHECK-THUMB-BE: @ %bb.0:
@@ -1033,28 +1033,28 @@ define i64 @test10(ptr %ptr, i64 %val) {
define i64 @test11(ptr %ptr, i64 %val) {
; CHECK-LE-LABEL: test11:
; CHECK-LE: @ %bb.0:
-; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r9, r1
+; CHECK-LE-NEXT: mov r12, r0
; CHECK-LE-NEXT: dmb ish
; CHECK-LE-NEXT: LBB10_1: @ %atomicrmw.start
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-LE-NEXT: mov r7, r2
-; CHECK-LE-NEXT: subs r3, r1, r4
-; CHECK-LE-NEXT: sbcs r3, r2, r5
-; CHECK-LE-NEXT: mov r3, #0
-; CHECK-LE-NEXT: movwhs r3, #1
-; CHECK-LE-NEXT: cmp r3, #0
-; CHECK-LE-NEXT: movne r7, r5
-; CHECK-LE-NEXT: mov r6, r1
-; CHECK-LE-NEXT: movne r6, r4
-; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-LE-NEXT: subs r5, r9, r0
+; CHECK-LE-NEXT: sbcs r5, r2, r1
+; CHECK-LE-NEXT: mov r5, #0
+; CHECK-LE-NEXT: movwhs r5, #1
+; CHECK-LE-NEXT: cmp r5, #0
+; CHECK-LE-NEXT: mov r5, r2
+; CHECK-LE-NEXT: movne r5, r1
+; CHECK-LE-NEXT: mov r4, r9
+; CHECK-LE-NEXT: movne r4, r0
+; CHECK-LE-NEXT: strexd r3, r4, r5, [r12]
; CHECK-LE-NEXT: cmp r3, #0
; CHECK-LE-NEXT: bne LBB10_1
; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-LE-NEXT: mov r0, r4
-; CHECK-LE-NEXT: mov r1, r5
; CHECK-LE-NEXT: dmb ish
-; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-LE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-LE-LABEL: test11:
; CHECK-THUMB-LE: @ %bb.0:
@@ -1085,28 +1085,28 @@ define i64 @test11(ptr %ptr, i64 %val) {
;
; CHECK-BE-LABEL: test11:
; CHECK-BE: @ %bb.0:
-; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r12, r1
+; CHECK-BE-NEXT: mov lr, r0
; CHECK-BE-NEXT: dmb ish
; CHECK-BE-NEXT: .LBB10_1: @ %atomicrmw.start
; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-BE-NEXT: mov r7, r2
-; CHECK-BE-NEXT: subs r3, r2, r5
-; CHECK-BE-NEXT: sbcs r3, r1, r4
-; CHECK-BE-NEXT: mov r3, #0
-; CHECK-BE-NEXT: movwhs r3, #1
-; CHECK-BE-NEXT: cmp r3, #0
-; CHECK-BE-NEXT: movne r7, r5
-; CHECK-BE-NEXT: mov r6, r1
-; CHECK-BE-NEXT: movne r6, r4
-; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: ldrexd r0, r1, [lr]
+; CHECK-BE-NEXT: subs r5, r2, r1
+; CHECK-BE-NEXT: sbcs r5, r12, r0
+; CHECK-BE-NEXT: mov r5, #0
+; CHECK-BE-NEXT: movwhs r5, #1
+; CHECK-BE-NEXT: cmp r5, #0
+; CHECK-BE-NEXT: mov r5, r2
+; CHECK-BE-NEXT: movne r5, r1
+; CHECK-BE-NEXT: mov r4, r12
+; CHECK-BE-NEXT: movne r4, r0
+; CHECK-BE-NEXT: strexd r3, r4, r5, [lr]
; CHECK-BE-NEXT: cmp r3, #0
; CHECK-BE-NEXT: bne .LBB10_1
; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-BE-NEXT: mov r0, r4
-; CHECK-BE-NEXT: mov r1, r5
; CHECK-BE-NEXT: dmb ish
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-BE-LABEL: test11:
; CHECK-THUMB-BE: @ %bb.0:
@@ -1220,28 +1220,28 @@ define i64 @test11(ptr %ptr, i64 %val) {
define i64 @test12(ptr %ptr, i64 %val) {
; CHECK-LE-LABEL: test12:
; CHECK-LE: @ %bb.0:
-; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r9, r1
+; CHECK-LE-NEXT: mov r12, r0
; CHECK-LE-NEXT: dmb ish
; CHECK-LE-NEXT: LBB11_1: @ %atomicrmw.start
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-LE-NEXT: mov r7, r2
-; CHECK-LE-NEXT: subs r3, r1, r4
-; CHECK-LE-NEXT: sbcs r3, r2, r5
-; CHECK-LE-NEXT: mov r3, #0
-; CHECK-LE-NEXT: movwlt r3, #1
-; CHECK-LE-NEXT: cmp r3, #0
-; CHECK-LE-NEXT: movne r7, r5
-; CHECK-LE-NEXT: mov r6, r1
-; CHECK-LE-NEXT: movne r6, r4
-; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-LE-NEXT: subs r5, r9, r0
+; CHECK-LE-NEXT: sbcs r5, r2, r1
+; CHECK-LE-NEXT: mov r5, #0
+; CHECK-LE-NEXT: movwlt r5, #1
+; CHECK-LE-NEXT: cmp r5, #0
+; CHECK-LE-NEXT: mov r5, r2
+; CHECK-LE-NEXT: movne r5, r1
+; CHECK-LE-NEXT: mov r4, r9
+; CHECK-LE-NEXT: movne r4, r0
+; CHECK-LE-NEXT: strexd r3, r4, r5, [r12]
; CHECK-LE-NEXT: cmp r3, #0
; CHECK-LE-NEXT: bne LBB11_1
; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-LE-NEXT: mov r0, r4
-; CHECK-LE-NEXT: mov r1, r5
; CHECK-LE-NEXT: dmb ish
-; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-LE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-LE-LABEL: test12:
; CHECK-THUMB-LE: @ %bb.0:
@@ -1272,28 +1272,28 @@ define i64 @test12(ptr %ptr, i64 %val) {
;
; CHECK-BE-LABEL: test12:
; CHECK-BE: @ %bb.0:
-; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r12, r1
+; CHECK-BE-NEXT: mov lr, r0
; CHECK-BE-NEXT: dmb ish
; CHECK-BE-NEXT: .LBB11_1: @ %atomicrmw.start
; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-BE-NEXT: mov r7, r2
-; CHECK-BE-NEXT: subs r3, r2, r5
-; CHECK-BE-NEXT: sbcs r3, r1, r4
-; CHECK-BE-NEXT: mov r3, #0
-; CHECK-BE-NEXT: movwlt r3, #1
-; CHECK-BE-NEXT: cmp r3, #0
-; CHECK-BE-NEXT: movne r7, r5
-; CHECK-BE-NEXT: mov r6, r1
-; CHECK-BE-NEXT: movne r6, r4
-; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: ldrexd r0, r1, [lr]
+; CHECK-BE-NEXT: subs r5, r2, r1
+; CHECK-BE-NEXT: sbcs r5, r12, r0
+; CHECK-BE-NEXT: mov r5, #0
+; CHECK-BE-NEXT: movwlt r5, #1
+; CHECK-BE-NEXT: cmp r5, #0
+; CHECK-BE-NEXT: mov r5, r2
+; CHECK-BE-NEXT: movne r5, r1
+; CHECK-BE-NEXT: mov r4, r12
+; CHECK-BE-NEXT: movne r4, r0
+; CHECK-BE-NEXT: strexd r3, r4, r5, [lr]
; CHECK-BE-NEXT: cmp r3, #0
; CHECK-BE-NEXT: bne .LBB11_1
; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-BE-NEXT: mov r0, r4
-; CHECK-BE-NEXT: mov r1, r5
; CHECK-BE-NEXT: dmb ish
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-BE-LABEL: test12:
; CHECK-THUMB-BE: @ %bb.0:
@@ -1407,28 +1407,28 @@ define i64 @test12(ptr %ptr, i64 %val) {
define i64 @test13(ptr %ptr, i64 %val) {
; CHECK-LE-LABEL: test13:
; CHECK-LE: @ %bb.0:
-; CHECK-LE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, lr}
+; CHECK-LE-NEXT: mov r9, r1
+; CHECK-LE-NEXT: mov r12, r0
; CHECK-LE-NEXT: dmb ish
; CHECK-LE-NEXT: LBB12_1: @ %atomicrmw.start
; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-LE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-LE-NEXT: mov r7, r2
-; CHECK-LE-NEXT: subs r3, r1, r4
-; CHECK-LE-NEXT: sbcs r3, r2, r5
-; CHECK-LE-NEXT: mov r3, #0
-; CHECK-LE-NEXT: movwlo r3, #1
-; CHECK-LE-NEXT: cmp r3, #0
-; CHECK-LE-NEXT: movne r7, r5
-; CHECK-LE-NEXT: mov r6, r1
-; CHECK-LE-NEXT: movne r6, r4
-; CHECK-LE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-LE-NEXT: ldrexd r0, r1, [r12]
+; CHECK-LE-NEXT: subs r5, r9, r0
+; CHECK-LE-NEXT: sbcs r5, r2, r1
+; CHECK-LE-NEXT: mov r5, #0
+; CHECK-LE-NEXT: movwlo r5, #1
+; CHECK-LE-NEXT: cmp r5, #0
+; CHECK-LE-NEXT: mov r5, r2
+; CHECK-LE-NEXT: movne r5, r1
+; CHECK-LE-NEXT: mov r4, r9
+; CHECK-LE-NEXT: movne r4, r0
+; CHECK-LE-NEXT: strexd r3, r4, r5, [r12]
; CHECK-LE-NEXT: cmp r3, #0
; CHECK-LE-NEXT: bne LBB12_1
; CHECK-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-LE-NEXT: mov r0, r4
-; CHECK-LE-NEXT: mov r1, r5
; CHECK-LE-NEXT: dmb ish
-; CHECK-LE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-LE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-LE-LABEL: test13:
; CHECK-THUMB-LE: @ %bb.0:
@@ -1459,28 +1459,28 @@ define i64 @test13(ptr %ptr, i64 %val) {
;
; CHECK-BE-LABEL: test13:
; CHECK-BE: @ %bb.0:
-; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, lr}
+; CHECK-BE-NEXT: mov r12, r1
+; CHECK-BE-NEXT: mov lr, r0
; CHECK-BE-NEXT: dmb ish
; CHECK-BE-NEXT: .LBB12_1: @ %atomicrmw.start
; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldrexd r4, r5, [r0]
-; CHECK-BE-NEXT: mov r7, r2
-; CHECK-BE-NEXT: subs r3, r2, r5
-; CHECK-BE-NEXT: sbcs r3, r1, r4
-; CHECK-BE-NEXT: mov r3, #0
-; CHECK-BE-NEXT: movwlo r3, #1
-; CHECK-BE-NEXT: cmp r3, #0
-; CHECK-BE-NEXT: movne r7, r5
-; CHECK-BE-NEXT: mov r6, r1
-; CHECK-BE-NEXT: movne r6, r4
-; CHECK-BE-NEXT: strexd r3, r6, r7, [r0]
+; CHECK-BE-NEXT: ldrexd r0, r1, [lr]
+; CHECK-BE-NEXT: subs r5, r2, r1
+; CHECK-BE-NEXT: sbcs r5, r12, r0
+; CHECK-BE-NEXT: mov r5, #0
+; CHECK-BE-NEXT: movwlo r5, #1
+; CHECK-BE-NEXT: cmp r5, #0
+; CHECK-BE-NEXT: mov r5, r2
+; CHECK-BE-NEXT: movne r5, r1
+; CHECK-BE-NEXT: mov r4, r12
+; CHECK-BE-NEXT: movne r4, r0
+; CHECK-BE-NEXT: strexd r3, r4, r5, [lr]
; CHECK-BE-NEXT: cmp r3, #0
; CHECK-BE-NEXT: bne .LBB12_1
; CHECK-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-BE-NEXT: mov r0, r4
-; CHECK-BE-NEXT: mov r1, r5
; CHECK-BE-NEXT: dmb ish
-; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-BE-NEXT: pop {r4, r5, pc}
;
; CHECK-THUMB-BE-LABEL: test13:
; CHECK-THUMB-BE: @ %bb.0:
diff --git a/llvm/test/CodeGen/ARM/atomic-ops-v8.ll b/llvm/test/CodeGen/ARM/atomic-ops-v8.ll
index c7d4cf5912d179..cc760d59c20454 100644
--- a/llvm/test/CodeGen/ARM/atomic-ops-v8.ll
+++ b/llvm/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -127,35 +127,35 @@ define void @test_atomic_load_add_i64(i64 %offset) nounwind {
; CHECK-ARM-LE-LABEL: test_atomic_load_add_i64:
; CHECK-ARM-LE: @ %bb.0:
; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-LE-NEXT: .LBB3_1: @ %atomicrmw.start
; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-LE-NEXT: ldrexd r4, r5, [r2]
-; CHECK-ARM-LE-NEXT: adds r6, r4, r0
-; CHECK-ARM-LE-NEXT: adc r7, r5, r1
-; CHECK-ARM-LE-NEXT: strexd r3, r6, r7, [r2]
-; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: ldrexd r6, r7, [r12]
+; CHECK-ARM-LE-NEXT: adds r4, r6, r0
+; CHECK-ARM-LE-NEXT: adc r5, r7, r1
+; CHECK-ARM-LE-NEXT: strexd r2, r4, r5, [r12]
+; CHECK-ARM-LE-NEXT: cmp r2, #0
; CHECK-ARM-LE-NEXT: bne .LBB3_1
; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-ARM-BE-LABEL: test_atomic_load_add_i64:
; CHECK-ARM-BE: @ %bb.0:
; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-BE-NEXT: .LBB3_1: @ %atomicrmw.start
; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-BE-NEXT: ldrexd r4, r5, [r2]
-; CHECK-ARM-BE-NEXT: adds r7, r5, r1
-; CHECK-ARM-BE-NEXT: adc r6, r4, r0
-; CHECK-ARM-BE-NEXT: strexd r3, r6, r7, [r2]
-; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: ldrexd r6, r7, [r12]
+; CHECK-ARM-BE-NEXT: adds r5, r7, r1
+; CHECK-ARM-BE-NEXT: adc r4, r6, r0
+; CHECK-ARM-BE-NEXT: strexd r2, r4, r5, [r12]
+; CHECK-ARM-BE-NEXT: cmp r2, #0
; CHECK-ARM-BE-NEXT: bne .LBB3_1
; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-THUMB-LE-LABEL: test_atomic_load_add_i64:
@@ -318,35 +318,35 @@ define void @test_atomic_load_sub_i64(i64 %offset) nounwind {
; CHECK-ARM-LE-LABEL: test_atomic_load_sub_i64:
; CHECK-ARM-LE: @ %bb.0:
; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-LE-NEXT: .LBB7_1: @ %atomicrmw.start
; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-LE-NEXT: ldaexd r4, r5, [r2]
-; CHECK-ARM-LE-NEXT: subs r6, r4, r0
-; CHECK-ARM-LE-NEXT: sbc r7, r5, r1
-; CHECK-ARM-LE-NEXT: stlexd r3, r6, r7, [r2]
-; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: ldaexd r6, r7, [r12]
+; CHECK-ARM-LE-NEXT: subs r4, r6, r0
+; CHECK-ARM-LE-NEXT: sbc r5, r7, r1
+; CHECK-ARM-LE-NEXT: stlexd r2, r4, r5, [r12]
+; CHECK-ARM-LE-NEXT: cmp r2, #0
; CHECK-ARM-LE-NEXT: bne .LBB7_1
; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-ARM-BE-LABEL: test_atomic_load_sub_i64:
; CHECK-ARM-BE: @ %bb.0:
; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-BE-NEXT: .LBB7_1: @ %atomicrmw.start
; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-BE-NEXT: ldaexd r4, r5, [r2]
-; CHECK-ARM-BE-NEXT: subs r7, r5, r1
-; CHECK-ARM-BE-NEXT: sbc r6, r4, r0
-; CHECK-ARM-BE-NEXT: stlexd r3, r6, r7, [r2]
-; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: ldaexd r6, r7, [r12]
+; CHECK-ARM-BE-NEXT: subs r5, r7, r1
+; CHECK-ARM-BE-NEXT: sbc r4, r6, r0
+; CHECK-ARM-BE-NEXT: stlexd r2, r4, r5, [r12]
+; CHECK-ARM-BE-NEXT: cmp r2, #0
; CHECK-ARM-BE-NEXT: bne .LBB7_1
; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-THUMB-LE-LABEL: test_atomic_load_sub_i64:
@@ -509,18 +509,18 @@ define void @test_atomic_load_and_i64(i64 %offset) nounwind {
; CHECK-ARM-LABEL: test_atomic_load_and_i64:
; CHECK-ARM: @ %bb.0:
; CHECK-ARM-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-NEXT: movt r12, :upper16:var64
; CHECK-ARM-NEXT: .LBB11_1: @ %atomicrmw.start
; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-NEXT: ldaexd r4, r5, [r2]
-; CHECK-ARM-NEXT: and r7, r5, r1
-; CHECK-ARM-NEXT: and r6, r4, r0
-; CHECK-ARM-NEXT: strexd r3, r6, r7, [r2]
-; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: ldaexd r6, r7, [r12]
+; CHECK-ARM-NEXT: and r5, r7, r1
+; CHECK-ARM-NEXT: and r4, r6, r0
+; CHECK-ARM-NEXT: strexd r2, r4, r5, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
; CHECK-ARM-NEXT: bne .LBB11_1
; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-NEXT: strd r6, r7, [r12]
; CHECK-ARM-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-THUMB-LABEL: test_atomic_load_and_i64:
@@ -666,18 +666,18 @@ define void @test_atomic_load_or_i64(i64 %offset) nounwind {
; CHECK-ARM-LABEL: test_atomic_load_or_i64:
; CHECK-ARM: @ %bb.0:
; CHECK-ARM-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-NEXT: movt r12, :upper16:var64
; CHECK-ARM-NEXT: .LBB15_1: @ %atomicrmw.start
; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-NEXT: ldrexd r4, r5, [r2]
-; CHECK-ARM-NEXT: orr r7, r5, r1
-; CHECK-ARM-NEXT: orr r6, r4, r0
-; CHECK-ARM-NEXT: stlexd r3, r6, r7, [r2]
-; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: ldrexd r6, r7, [r12]
+; CHECK-ARM-NEXT: orr r5, r7, r1
+; CHECK-ARM-NEXT: orr r4, r6, r0
+; CHECK-ARM-NEXT: stlexd r2, r4, r5, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
; CHECK-ARM-NEXT: bne .LBB15_1
; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-NEXT: strd r6, r7, [r12]
; CHECK-ARM-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-THUMB-LABEL: test_atomic_load_or_i64:
@@ -823,18 +823,18 @@ define void @test_atomic_load_xor_i64(i64 %offset) nounwind {
; CHECK-ARM-LABEL: test_atomic_load_xor_i64:
; CHECK-ARM: @ %bb.0:
; CHECK-ARM-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-NEXT: movt r12, :upper16:var64
; CHECK-ARM-NEXT: .LBB19_1: @ %atomicrmw.start
; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-NEXT: ldrexd r4, r5, [r2]
-; CHECK-ARM-NEXT: eor r7, r5, r1
-; CHECK-ARM-NEXT: eor r6, r4, r0
-; CHECK-ARM-NEXT: strexd r3, r6, r7, [r2]
-; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: ldrexd r6, r7, [r12]
+; CHECK-ARM-NEXT: eor r5, r7, r1
+; CHECK-ARM-NEXT: eor r4, r6, r0
+; CHECK-ARM-NEXT: strexd r2, r4, r5, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
; CHECK-ARM-NEXT: bne .LBB19_1
; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-NEXT: strd r6, r7, [r12]
; CHECK-ARM-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-THUMB-LABEL: test_atomic_load_xor_i64:
@@ -932,18 +932,18 @@ define void @test_atomic_load_xchg_i64(i64 %offset) nounwind {
; CHECK-ARM-LABEL: test_atomic_load_xchg_i64:
; CHECK-ARM: @ %bb.0:
; CHECK-ARM-NEXT: push {r4, r5, r11, lr}
-; CHECK-ARM-NEXT: movw r2, :lower16:var64
+; CHECK-ARM-NEXT: movw r12, :lower16:var64
; CHECK-ARM-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1
-; CHECK-ARM-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-NEXT: movt r12, :upper16:var64
; CHECK-ARM-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1
; CHECK-ARM-NEXT: .LBB23_1: @ %atomicrmw.start
; CHECK-ARM-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-NEXT: ldaexd r4, r5, [r2]
-; CHECK-ARM-NEXT: strexd r3, r0, r1, [r2]
-; CHECK-ARM-NEXT: cmp r3, #0
+; CHECK-ARM-NEXT: ldaexd r4, r5, [r12]
+; CHECK-ARM-NEXT: strexd r2, r0, r1, [r12]
+; CHECK-ARM-NEXT: cmp r2, #0
; CHECK-ARM-NEXT: bne .LBB23_1
; CHECK-ARM-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-NEXT: strd r4, r5, [r12]
; CHECK-ARM-NEXT: pop {r4, r5, r11, pc}
;
; CHECK-THUMB-LABEL: test_atomic_load_xchg_i64:
@@ -1109,49 +1109,49 @@ define void @test_atomic_load_min_i64(i64 %offset) nounwind {
; CHECK-ARM-LE-LABEL: test_atomic_load_min_i64:
; CHECK-ARM-LE: @ %bb.0:
; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-LE-NEXT: .LBB27_1: @ %atomicrmw.start
; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-LE-NEXT: ldaexd r4, r5, [r2]
-; CHECK-ARM-LE-NEXT: mov r7, r1
-; CHECK-ARM-LE-NEXT: subs r3, r0, r4
-; CHECK-ARM-LE-NEXT: sbcs r3, r1, r5
-; CHECK-ARM-LE-NEXT: mov r3, #0
-; CHECK-ARM-LE-NEXT: movwge r3, #1
-; CHECK-ARM-LE-NEXT: cmp r3, #0
-; CHECK-ARM-LE-NEXT: movne r7, r5
-; CHECK-ARM-LE-NEXT: mov r6, r0
-; CHECK-ARM-LE-NEXT: movne r6, r4
-; CHECK-ARM-LE-NEXT: stlexd r3, r6, r7, [r2]
-; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: ldaexd r6, r7, [r12]
+; CHECK-ARM-LE-NEXT: subs r5, r0, r6
+; CHECK-ARM-LE-NEXT: sbcs r5, r1, r7
+; CHECK-ARM-LE-NEXT: mov r5, #0
+; CHECK-ARM-LE-NEXT: movwge r5, #1
+; CHECK-ARM-LE-NEXT: cmp r5, #0
+; CHECK-ARM-LE-NEXT: mov r5, r1
+; CHECK-ARM-LE-NEXT: movne r5, r7
+; CHECK-ARM-LE-NEXT: mov r4, r0
+; CHECK-ARM-LE-NEXT: movne r4, r6
+; CHECK-ARM-LE-NEXT: stlexd r2, r4, r5, [r12]
+; CHECK-ARM-LE-NEXT: cmp r2, #0
; CHECK-ARM-LE-NEXT: bne .LBB27_1
; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-ARM-BE-LABEL: test_atomic_load_min_i64:
; CHECK-ARM-BE: @ %bb.0:
; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-BE-NEXT: .LBB27_1: @ %atomicrmw.start
; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-BE-NEXT: ldaexd r4, r5, [r2]
-; CHECK-ARM-BE-NEXT: mov r7, r1
-; CHECK-ARM-BE-NEXT: subs r3, r1, r5
-; CHECK-ARM-BE-NEXT: sbcs r3, r0, r4
-; CHECK-ARM-BE-NEXT: mov r3, #0
-; CHECK-ARM-BE-NEXT: movwge r3, #1
-; CHECK-ARM-BE-NEXT: cmp r3, #0
-; CHECK-ARM-BE-NEXT: movne r7, r5
-; CHECK-ARM-BE-NEXT: mov r6, r0
-; CHECK-ARM-BE-NEXT: movne r6, r4
-; CHECK-ARM-BE-NEXT: stlexd r3, r6, r7, [r2]
-; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: ldaexd r6, r7, [r12]
+; CHECK-ARM-BE-NEXT: subs r5, r1, r7
+; CHECK-ARM-BE-NEXT: sbcs r5, r0, r6
+; CHECK-ARM-BE-NEXT: mov r5, #0
+; CHECK-ARM-BE-NEXT: movwge r5, #1
+; CHECK-ARM-BE-NEXT: cmp r5, #0
+; CHECK-ARM-BE-NEXT: mov r5, r1
+; CHECK-ARM-BE-NEXT: movne r5, r7
+; CHECK-ARM-BE-NEXT: mov r4, r0
+; CHECK-ARM-BE-NEXT: movne r4, r6
+; CHECK-ARM-BE-NEXT: stlexd r2, r4, r5, [r12]
+; CHECK-ARM-BE-NEXT: cmp r2, #0
; CHECK-ARM-BE-NEXT: bne .LBB27_1
; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-THUMB-LE-LABEL: test_atomic_load_min_i64:
@@ -1354,49 +1354,49 @@ define void @test_atomic_load_max_i64(i64 %offset) nounwind {
; CHECK-ARM-LE-LABEL: test_atomic_load_max_i64:
; CHECK-ARM-LE: @ %bb.0:
; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-LE-NEXT: .LBB31_1: @ %atomicrmw.start
; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-LE-NEXT: ldrexd r4, r5, [r2]
-; CHECK-ARM-LE-NEXT: mov r7, r1
-; CHECK-ARM-LE-NEXT: subs r3, r0, r4
-; CHECK-ARM-LE-NEXT: sbcs r3, r1, r5
-; CHECK-ARM-LE-NEXT: mov r3, #0
-; CHECK-ARM-LE-NEXT: movwlt r3, #1
-; CHECK-ARM-LE-NEXT: cmp r3, #0
-; CHECK-ARM-LE-NEXT: movne r7, r5
-; CHECK-ARM-LE-NEXT: mov r6, r0
-; CHECK-ARM-LE-NEXT: movne r6, r4
-; CHECK-ARM-LE-NEXT: strexd r3, r6, r7, [r2]
-; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: ldrexd r6, r7, [r12]
+; CHECK-ARM-LE-NEXT: subs r5, r0, r6
+; CHECK-ARM-LE-NEXT: sbcs r5, r1, r7
+; CHECK-ARM-LE-NEXT: mov r5, #0
+; CHECK-ARM-LE-NEXT: movwlt r5, #1
+; CHECK-ARM-LE-NEXT: cmp r5, #0
+; CHECK-ARM-LE-NEXT: mov r5, r1
+; CHECK-ARM-LE-NEXT: movne r5, r7
+; CHECK-ARM-LE-NEXT: mov r4, r0
+; CHECK-ARM-LE-NEXT: movne r4, r6
+; CHECK-ARM-LE-NEXT: strexd r2, r4, r5, [r12]
+; CHECK-ARM-LE-NEXT: cmp r2, #0
; CHECK-ARM-LE-NEXT: bne .LBB31_1
; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-ARM-BE-LABEL: test_atomic_load_max_i64:
; CHECK-ARM-BE: @ %bb.0:
; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-BE-NEXT: .LBB31_1: @ %atomicrmw.start
; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-BE-NEXT: ldrexd r4, r5, [r2]
-; CHECK-ARM-BE-NEXT: mov r7, r1
-; CHECK-ARM-BE-NEXT: subs r3, r1, r5
-; CHECK-ARM-BE-NEXT: sbcs r3, r0, r4
-; CHECK-ARM-BE-NEXT: mov r3, #0
-; CHECK-ARM-BE-NEXT: movwlt r3, #1
-; CHECK-ARM-BE-NEXT: cmp r3, #0
-; CHECK-ARM-BE-NEXT: movne r7, r5
-; CHECK-ARM-BE-NEXT: mov r6, r0
-; CHECK-ARM-BE-NEXT: movne r6, r4
-; CHECK-ARM-BE-NEXT: strexd r3, r6, r7, [r2]
-; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: ldrexd r6, r7, [r12]
+; CHECK-ARM-BE-NEXT: subs r5, r1, r7
+; CHECK-ARM-BE-NEXT: sbcs r5, r0, r6
+; CHECK-ARM-BE-NEXT: mov r5, #0
+; CHECK-ARM-BE-NEXT: movwlt r5, #1
+; CHECK-ARM-BE-NEXT: cmp r5, #0
+; CHECK-ARM-BE-NEXT: mov r5, r1
+; CHECK-ARM-BE-NEXT: movne r5, r7
+; CHECK-ARM-BE-NEXT: mov r4, r0
+; CHECK-ARM-BE-NEXT: movne r4, r6
+; CHECK-ARM-BE-NEXT: strexd r2, r4, r5, [r12]
+; CHECK-ARM-BE-NEXT: cmp r2, #0
; CHECK-ARM-BE-NEXT: bne .LBB31_1
; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-THUMB-LE-LABEL: test_atomic_load_max_i64:
@@ -1595,49 +1595,49 @@ define void @test_atomic_load_umin_i64(i64 %offset) nounwind {
; CHECK-ARM-LE-LABEL: test_atomic_load_umin_i64:
; CHECK-ARM-LE: @ %bb.0:
; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-LE-NEXT: .LBB35_1: @ %atomicrmw.start
; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-LE-NEXT: ldaexd r4, r5, [r2]
-; CHECK-ARM-LE-NEXT: mov r7, r1
-; CHECK-ARM-LE-NEXT: subs r3, r0, r4
-; CHECK-ARM-LE-NEXT: sbcs r3, r1, r5
-; CHECK-ARM-LE-NEXT: mov r3, #0
-; CHECK-ARM-LE-NEXT: movwhs r3, #1
-; CHECK-ARM-LE-NEXT: cmp r3, #0
-; CHECK-ARM-LE-NEXT: movne r7, r5
-; CHECK-ARM-LE-NEXT: mov r6, r0
-; CHECK-ARM-LE-NEXT: movne r6, r4
-; CHECK-ARM-LE-NEXT: stlexd r3, r6, r7, [r2]
-; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: ldaexd r6, r7, [r12]
+; CHECK-ARM-LE-NEXT: subs r5, r0, r6
+; CHECK-ARM-LE-NEXT: sbcs r5, r1, r7
+; CHECK-ARM-LE-NEXT: mov r5, #0
+; CHECK-ARM-LE-NEXT: movwhs r5, #1
+; CHECK-ARM-LE-NEXT: cmp r5, #0
+; CHECK-ARM-LE-NEXT: mov r5, r1
+; CHECK-ARM-LE-NEXT: movne r5, r7
+; CHECK-ARM-LE-NEXT: mov r4, r0
+; CHECK-ARM-LE-NEXT: movne r4, r6
+; CHECK-ARM-LE-NEXT: stlexd r2, r4, r5, [r12]
+; CHECK-ARM-LE-NEXT: cmp r2, #0
; CHECK-ARM-LE-NEXT: bne .LBB35_1
; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-ARM-BE-LABEL: test_atomic_load_umin_i64:
; CHECK-ARM-BE: @ %bb.0:
; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-BE-NEXT: .LBB35_1: @ %atomicrmw.start
; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-BE-NEXT: ldaexd r4, r5, [r2]
-; CHECK-ARM-BE-NEXT: mov r7, r1
-; CHECK-ARM-BE-NEXT: subs r3, r1, r5
-; CHECK-ARM-BE-NEXT: sbcs r3, r0, r4
-; CHECK-ARM-BE-NEXT: mov r3, #0
-; CHECK-ARM-BE-NEXT: movwhs r3, #1
-; CHECK-ARM-BE-NEXT: cmp r3, #0
-; CHECK-ARM-BE-NEXT: movne r7, r5
-; CHECK-ARM-BE-NEXT: mov r6, r0
-; CHECK-ARM-BE-NEXT: movne r6, r4
-; CHECK-ARM-BE-NEXT: stlexd r3, r6, r7, [r2]
-; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: ldaexd r6, r7, [r12]
+; CHECK-ARM-BE-NEXT: subs r5, r1, r7
+; CHECK-ARM-BE-NEXT: sbcs r5, r0, r6
+; CHECK-ARM-BE-NEXT: mov r5, #0
+; CHECK-ARM-BE-NEXT: movwhs r5, #1
+; CHECK-ARM-BE-NEXT: cmp r5, #0
+; CHECK-ARM-BE-NEXT: mov r5, r1
+; CHECK-ARM-BE-NEXT: movne r5, r7
+; CHECK-ARM-BE-NEXT: mov r4, r0
+; CHECK-ARM-BE-NEXT: movne r4, r6
+; CHECK-ARM-BE-NEXT: stlexd r2, r4, r5, [r12]
+; CHECK-ARM-BE-NEXT: cmp r2, #0
; CHECK-ARM-BE-NEXT: bne .LBB35_1
; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-THUMB-LE-LABEL: test_atomic_load_umin_i64:
@@ -1836,49 +1836,49 @@ define void @test_atomic_load_umax_i64(i64 %offset) nounwind {
; CHECK-ARM-LE-LABEL: test_atomic_load_umax_i64:
; CHECK-ARM-LE: @ %bb.0:
; CHECK-ARM-LE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-LE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-LE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-LE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-LE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-LE-NEXT: .LBB39_1: @ %atomicrmw.start
; CHECK-ARM-LE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-LE-NEXT: ldaexd r4, r5, [r2]
-; CHECK-ARM-LE-NEXT: mov r7, r1
-; CHECK-ARM-LE-NEXT: subs r3, r0, r4
-; CHECK-ARM-LE-NEXT: sbcs r3, r1, r5
-; CHECK-ARM-LE-NEXT: mov r3, #0
-; CHECK-ARM-LE-NEXT: movwlo r3, #1
-; CHECK-ARM-LE-NEXT: cmp r3, #0
-; CHECK-ARM-LE-NEXT: movne r7, r5
-; CHECK-ARM-LE-NEXT: mov r6, r0
-; CHECK-ARM-LE-NEXT: movne r6, r4
-; CHECK-ARM-LE-NEXT: stlexd r3, r6, r7, [r2]
-; CHECK-ARM-LE-NEXT: cmp r3, #0
+; CHECK-ARM-LE-NEXT: ldaexd r6, r7, [r12]
+; CHECK-ARM-LE-NEXT: subs r5, r0, r6
+; CHECK-ARM-LE-NEXT: sbcs r5, r1, r7
+; CHECK-ARM-LE-NEXT: mov r5, #0
+; CHECK-ARM-LE-NEXT: movwlo r5, #1
+; CHECK-ARM-LE-NEXT: cmp r5, #0
+; CHECK-ARM-LE-NEXT: mov r5, r1
+; CHECK-ARM-LE-NEXT: movne r5, r7
+; CHECK-ARM-LE-NEXT: mov r4, r0
+; CHECK-ARM-LE-NEXT: movne r4, r6
+; CHECK-ARM-LE-NEXT: stlexd r2, r4, r5, [r12]
+; CHECK-ARM-LE-NEXT: cmp r2, #0
; CHECK-ARM-LE-NEXT: bne .LBB39_1
; CHECK-ARM-LE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-LE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-LE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-LE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-ARM-BE-LABEL: test_atomic_load_umax_i64:
; CHECK-ARM-BE: @ %bb.0:
; CHECK-ARM-BE-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-ARM-BE-NEXT: movw r2, :lower16:var64
-; CHECK-ARM-BE-NEXT: movt r2, :upper16:var64
+; CHECK-ARM-BE-NEXT: movw r12, :lower16:var64
+; CHECK-ARM-BE-NEXT: movt r12, :upper16:var64
; CHECK-ARM-BE-NEXT: .LBB39_1: @ %atomicrmw.start
; CHECK-ARM-BE-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-ARM-BE-NEXT: ldaexd r4, r5, [r2]
-; CHECK-ARM-BE-NEXT: mov r7, r1
-; CHECK-ARM-BE-NEXT: subs r3, r1, r5
-; CHECK-ARM-BE-NEXT: sbcs r3, r0, r4
-; CHECK-ARM-BE-NEXT: mov r3, #0
-; CHECK-ARM-BE-NEXT: movwlo r3, #1
-; CHECK-ARM-BE-NEXT: cmp r3, #0
-; CHECK-ARM-BE-NEXT: movne r7, r5
-; CHECK-ARM-BE-NEXT: mov r6, r0
-; CHECK-ARM-BE-NEXT: movne r6, r4
-; CHECK-ARM-BE-NEXT: stlexd r3, r6, r7, [r2]
-; CHECK-ARM-BE-NEXT: cmp r3, #0
+; CHECK-ARM-BE-NEXT: ldaexd r6, r7, [r12]
+; CHECK-ARM-BE-NEXT: subs r5, r1, r7
+; CHECK-ARM-BE-NEXT: sbcs r5, r0, r6
+; CHECK-ARM-BE-NEXT: mov r5, #0
+; CHECK-ARM-BE-NEXT: movwlo r5, #1
+; CHECK-ARM-BE-NEXT: cmp r5, #0
+; CHECK-ARM-BE-NEXT: mov r5, r1
+; CHECK-ARM-BE-NEXT: movne r5, r7
+; CHECK-ARM-BE-NEXT: mov r4, r0
+; CHECK-ARM-BE-NEXT: movne r4, r6
+; CHECK-ARM-BE-NEXT: stlexd r2, r4, r5, [r12]
+; CHECK-ARM-BE-NEXT: cmp r2, #0
; CHECK-ARM-BE-NEXT: bne .LBB39_1
; CHECK-ARM-BE-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-ARM-BE-NEXT: strd r4, r5, [r2]
+; CHECK-ARM-BE-NEXT: strd r6, r7, [r12]
; CHECK-ARM-BE-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; CHECK-THUMB-LE-LABEL: test_atomic_load_umax_i64:
diff --git a/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll
index 243ec4deecdb84..f4e25916694ed5 100644
--- a/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll
@@ -69,29 +69,28 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
; CHECK-LABEL: atomicrmw_uinc_wrap_i64:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT: .save {r4, r6, r7, lr}
+; CHECK-NEXT: push {r4, r6, r7, lr}
+; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: dmb ish
; CHECK-NEXT: .LBB3_1: @ %atomicrmw.start
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrexd r4, r5, [r0]
-; CHECK-NEXT: adds r6, r4, #1
-; CHECK-NEXT: adc r7, r5, #0
-; CHECK-NEXT: subs r1, r4, r2
-; CHECK-NEXT: sbcs r1, r5, r3
-; CHECK-NEXT: mov r1, #0
-; CHECK-NEXT: movwhs r1, #1
-; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: ldrexd r0, r1, [r12]
+; CHECK-NEXT: adds r6, r0, #1
+; CHECK-NEXT: adc r7, r1, #0
+; CHECK-NEXT: subs r4, r0, r2
+; CHECK-NEXT: sbcs r4, r1, r3
+; CHECK-NEXT: mov r4, #0
+; CHECK-NEXT: movwhs r4, #1
+; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: movwne r7, #0
; CHECK-NEXT: movwne r6, #0
-; CHECK-NEXT: strexd r1, r6, r7, [r0]
-; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: strexd r4, r6, r7, [r12]
+; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: bne .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-NEXT: mov r0, r4
-; CHECK-NEXT: mov r1, r5
; CHECK-NEXT: dmb ish
-; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc}
+; CHECK-NEXT: pop {r4, r6, r7, pc}
%result = atomicrmw uinc_wrap ptr %ptr, i64 %val seq_cst
ret i64 %result
}
@@ -172,29 +171,28 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: dmb ish
; CHECK-NEXT: .LBB7_1: @ %atomicrmw.start
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrexd r4, r5, [r0]
-; CHECK-NEXT: mov r12, #0
-; CHECK-NEXT: subs r1, r2, r4
-; CHECK-NEXT: sbcs r1, r3, r5
-; CHECK-NEXT: orr r1, r4, r5
-; CHECK-NEXT: clz r1, r1
-; CHECK-NEXT: movwlo r12, #1
-; CHECK-NEXT: lsr r1, r1, #5
-; CHECK-NEXT: subs r6, r4, #1
-; CHECK-NEXT: sbc r7, r5, #0
-; CHECK-NEXT: orr r1, r1, r12
-; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: ldrexd r0, r1, [r12]
+; CHECK-NEXT: orr r4, r0, r1
+; CHECK-NEXT: subs r5, r2, r0
+; CHECK-NEXT: clz r4, r4
+; CHECK-NEXT: sbcs r5, r3, r1
+; CHECK-NEXT: lsr r4, r4, #5
+; CHECK-NEXT: mov r5, #0
+; CHECK-NEXT: movwlo r5, #1
+; CHECK-NEXT: subs r6, r0, #1
+; CHECK-NEXT: orr lr, r4, r5
+; CHECK-NEXT: sbc r7, r1, #0
+; CHECK-NEXT: cmp lr, #0
; CHECK-NEXT: movne r7, r3
; CHECK-NEXT: movne r6, r2
-; CHECK-NEXT: strexd r1, r6, r7, [r0]
-; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: strexd r4, r6, r7, [r12]
+; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: bne .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %atomicrmw.end
-; CHECK-NEXT: mov r0, r4
-; CHECK-NEXT: mov r1, r5
; CHECK-NEXT: dmb ish
; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc}
%result = atomicrmw udec_wrap ptr %ptr, i64 %val seq_cst
>From e0f3e2f55ff4f0a451e254badeae77592fe36e36 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Thu, 15 Aug 2024 09:33:29 +0100
Subject: [PATCH 4/4] Update AMDGPU tests
---
llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 227 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 366 +-
.../AMDGPU/memory-legalizer-global-agent.ll | 3244 ++++++++---------
.../memory-legalizer-global-nontemporal.ll | 24 +-
.../memory-legalizer-global-singlethread.ll | 3244 ++++++++---------
.../AMDGPU/memory-legalizer-global-system.ll | 2812 +++++++-------
.../memory-legalizer-global-volatile.ll | 24 +-
.../memory-legalizer-global-wavefront.ll | 3244 ++++++++---------
.../memory-legalizer-global-workgroup.ll | 3244 ++++++++---------
...rlapping-tuple-copy-implicit-op-failure.ll | 80 +-
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 208 +-
.../AMDGPU/sgpr-spills-split-regalloc.ll | 29 +-
12 files changed, 8371 insertions(+), 8375 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 75f5eda608e80a..221f4c55d998da 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -90,9 +90,9 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
-; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0
-; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
-; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
+; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s0
+; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
+; GCN-O0-NEXT: buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
@@ -105,28 +105,28 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_cbranch_execz .LBB0_3
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
+; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
+; GCN-O0-NEXT: s_waitcnt expcnt(0)
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: v_mov_b32_e32 v2, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
-; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
-; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
+; GCN-O0-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2
+; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
+; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
; GCN-O0-NEXT: s_mov_b32 s2, 2
-; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2
+; GCN-O0-NEXT: v_lshl_b64 v[0:1], v[0:1], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GCN-O0-NEXT: .LBB0_3: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
@@ -264,9 +264,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
-; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0
-; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
-; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
+; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s0
+; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
+; GCN-O0-NEXT: buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
@@ -279,28 +279,28 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_cbranch_execz .LBB1_4
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
+; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
+; GCN-O0-NEXT: s_waitcnt expcnt(0)
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: v_mov_b32_e32 v2, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
-; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
-; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
+; GCN-O0-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2
+; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
+; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
; GCN-O0-NEXT: s_mov_b32 s2, 2
-; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2
+; GCN-O0-NEXT: v_lshl_b64 v[0:1], v[0:1], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB1_4
; GCN-O0-NEXT: .LBB1_3: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -314,30 +314,30 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_branch .LBB1_5
; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_readlane_b32 s2, v0, 4
-; GCN-O0-NEXT: v_readlane_b32 s3, v0, 5
+; GCN-O0-NEXT: v_readlane_b32 s2, v1, 4
+; GCN-O0-NEXT: v_readlane_b32 s3, v1, 5
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
-; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
+; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
+; GCN-O0-NEXT: s_waitcnt expcnt(0)
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: v_mov_b32_e32 v2, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
-; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
-; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
-; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0
+; GCN-O0-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2
+; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
+; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
+; GCN-O0-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB1_3
; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -453,15 +453,15 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_mov_b32 s4, 2
-; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s4, v1
+; GCN-O0-NEXT: s_waitcnt expcnt(0)
+; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s4, v1
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr4
-; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
-; GCN-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
-; GCN-O0-NEXT: v_mov_b32_e32 v4, v2
-; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
-; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64
+; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
+; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
+; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
+; GCN-O0-NEXT: buffer_store_dword v4, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
@@ -512,52 +512,52 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
+; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
+; GCN-O0-NEXT: s_waitcnt expcnt(0)
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: v_mov_b32_e32 v2, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
-; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
-; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
+; GCN-O0-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2
+; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
+; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
; GCN-O0-NEXT: s_mov_b32 s2, 2
-; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2
+; GCN-O0-NEXT: v_lshl_b64 v[0:1], v[0:1], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB2_5
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
-; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
+; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
+; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
+; GCN-O0-NEXT: s_waitcnt expcnt(0)
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: v_mov_b32_e32 v2, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
-; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
-; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
-; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0
+; GCN-O0-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2
+; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
+; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
+; GCN-O0-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB2_2
; GCN-O0-NEXT: .LBB2_5: ; %Flow1
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -690,34 +690,35 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 2
-; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s0, v1
+; GCN-O0-NEXT: s_waitcnt expcnt(0)
+; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s0, v1
; GCN-O0-NEXT: s_mov_b32 s1, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr1
-; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
-; GCN-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
-; GCN-O0-NEXT: v_mov_b32_e32 v4, v2
+; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
+; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s2, s4
-; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
+; GCN-O0-NEXT: v_mov_b32_e32 v4, v2
; GCN-O0-NEXT: s_mov_b32 s1, s5
-; GCN-O0-NEXT: v_mov_b32_e32 v6, v4
-; GCN-O0-NEXT: v_add_i32_e64 v5, s[2:3], s2, v2
-; GCN-O0-NEXT: v_mov_b32_e32 v2, s1
-; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3]
-; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GCN-O0-NEXT: v_mov_b32_e32 v6, v2
-; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: v_mov_b32_e32 v6, v3
+; GCN-O0-NEXT: v_add_i32_e64 v4, s[2:3], s2, v4
+; GCN-O0-NEXT: v_mov_b32_e32 v5, s1
+; GCN-O0-NEXT: v_addc_u32_e64 v6, s[2:3], v5, v6, s[2:3]
+; GCN-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v5, v6
+; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
-; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
+; GCN-O0-NEXT: s_waitcnt expcnt(1)
+; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
+; GCN-O0-NEXT: buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
@@ -753,8 +754,8 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
@@ -763,9 +764,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT: v_mov_b32_e32 v2, 1
+; GCN-O0-NEXT: v_mov_b32_e32 v4, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4
+; GCN-O0-NEXT: buffer_store_dword v4, v[2:3], s[0:3], 0 addr64 offset:4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
@@ -778,9 +779,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_7
; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then
-; GCN-O0-NEXT: s_waitcnt expcnt(1)
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_waitcnt expcnt(0)
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
@@ -789,10 +790,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
+; GCN-O0-NEXT: v_mov_b32_e32 v2, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
+; GCN-O0-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:8
; GCN-O0-NEXT: s_branch .LBB3_7
; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -800,8 +800,8 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: s_mov_b32 s2, s0
@@ -810,9 +810,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
+; GCN-O0-NEXT: v_mov_b32_e32 v4, 3
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 offset:12
+; GCN-O0-NEXT: buffer_store_dword v4, v[2:3], s[4:7], 0 addr64 offset:12
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6
@@ -824,9 +824,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_6
; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2
-; GCN-O0-NEXT: s_waitcnt expcnt(1)
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_waitcnt expcnt(0)
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
@@ -835,10 +835,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: v_mov_b32_e32 v0, 4
+; GCN-O0-NEXT: v_mov_b32_e32 v2, 4
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16
+; GCN-O0-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:16
; GCN-O0-NEXT: .LBB3_6: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
@@ -977,9 +976,9 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v1, v2
; GCN-O0-NEXT: s_mov_b32 s4, 2
-; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[0:1], s4
-; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
-; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-O0-NEXT: v_lshl_b64 v[0:1], v[0:1], s4
+; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
+; GCN-O0-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GCN-O0-NEXT: .LBB4_2: ; %bb.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index fea1303d0a2b76..8a2561075113fc 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -915,45 +915,45 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v12, v[19:20]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7
; GFX9-O0-NEXT: s_mov_b32 s6, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20]
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24
-; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12
+; GFX9-O0-NEXT: v_sub_u32_e64 v21, s6, v12
+; GFX9-O0-NEXT: v_lshlrev_b64 v[21:22], v21, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22
+; GFX9-O0-NEXT: v_or_b32_e64 v15, v15, v23
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6
-; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6
-; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v12, s6
+; GFX9-O0-NEXT: v_sub_u32_e64 v15, v12, s6
+; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], v15, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v22
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v15, v15, v23, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s6, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7]
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v12, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v15, v15, v23, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v21
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v12, v[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s7
@@ -1036,8 +1036,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -1047,21 +1047,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: s_mov_b32 s4, s7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s6
; GFX9-O0-NEXT: s_mov_b32 s9, s7
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5
+; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v8, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v6, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
@@ -1069,7 +1069,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
@@ -1081,12 +1081,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
-; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12]
+; GFX9-O0-NEXT: v_sub_u32_e64 v7, s4, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v7, v[11:12]
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
; GFX9-O0-NEXT: s_mov_b32 s4, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3
-; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8]
+; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[3:4]
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15
; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
@@ -1095,16 +1095,16 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v7, s4
; GFX9-O0-NEXT: s_mov_b32 s10, 63
-; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5]
+; GFX9-O0-NEXT: v_sub_u32_e64 v8, s10, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v8, v[3:4]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s10, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v7, s10
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[10:11]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
@@ -1113,8 +1113,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v7, v[3:4]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5]
@@ -1756,13 +1756,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v7
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v4, v[2:3]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec
; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
@@ -1863,13 +1863,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2
; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
; GFX9-G-O0-NEXT: s_mov_b32 s8, 1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[14:15], v2, v[0:1]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[14:15], v4, v[0:1]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v4, v[2:3]
; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec
; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
; GFX9-G-O0-NEXT: s_mov_b32 s9, 31
@@ -1897,12 +1897,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v23
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v24
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v25
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v26
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[27:28], v0, v[2:3]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[14:15]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v25
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v26
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[27:28], v14, v[2:3]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
; GFX9-G-O0-NEXT: ; kill: def $vgpr14 killed $vgpr2 killed $exec
; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
; GFX9-G-O0-NEXT: s_mov_b32 s8, 31
@@ -2064,37 +2064,37 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: s_mov_b32 s4, 64
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v18
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v17
-; GFX9-G-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v18
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v17
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v4
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v19, v4
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v19
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v21, v20, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v19, v6, v20
; GFX9-G-O0-NEXT: s_mov_b32 s6, 0
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v6
+; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v20, v6
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v6
+; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v6
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v19, v[21:22]
-; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v19, v[23:24]
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v26
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v20, v[4:5]
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[22:23], v20, v[22:23]
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v19, v[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v22
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v23
; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v25
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v25
; GFX9-G-O0-NEXT: v_or_b32_e64 v20, v20, v23
-; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v5, v19
+; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v19, v22
; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0
-; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22]
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v21, v[4:5]
; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21
; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22
; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[4:5]
@@ -3120,45 +3120,45 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v12, v[19:20]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7
; GFX9-O0-NEXT: s_mov_b32 s6, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20]
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24
-; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12
+; GFX9-O0-NEXT: v_sub_u32_e64 v21, s6, v12
+; GFX9-O0-NEXT: v_lshlrev_b64 v[21:22], v21, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22
+; GFX9-O0-NEXT: v_or_b32_e64 v15, v15, v23
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6
-; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6
-; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v12, s6
+; GFX9-O0-NEXT: v_sub_u32_e64 v15, v12, s6
+; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], v15, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v22
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v15, v15, v23, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s6, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7]
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v12, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v15, v15, v23, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v21
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v12, v[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s7
@@ -3241,8 +3241,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -3252,21 +3252,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: s_mov_b32 s4, s7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s6
; GFX9-O0-NEXT: s_mov_b32 s9, s7
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5
+; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v8, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v6, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
@@ -3274,7 +3274,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
@@ -3286,12 +3286,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
-; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12]
+; GFX9-O0-NEXT: v_sub_u32_e64 v7, s4, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v7, v[11:12]
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
; GFX9-O0-NEXT: s_mov_b32 s4, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3
-; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8]
+; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[3:4]
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15
; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
@@ -3300,16 +3300,16 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v7, s4
; GFX9-O0-NEXT: s_mov_b32 s10, 63
-; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5]
+; GFX9-O0-NEXT: v_sub_u32_e64 v8, s10, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v8, v[3:4]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s10, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v7, s10
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[10:11]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
@@ -3318,8 +3318,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v7, v[3:4]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5]
@@ -3872,13 +3872,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v7
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v4, v[2:3]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec
; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
@@ -3979,13 +3979,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2
; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
; GFX9-G-O0-NEXT: s_mov_b32 s8, 1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v4, v[0:1]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v4, v[2:3]
; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec
; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
; GFX9-G-O0-NEXT: s_mov_b32 s9, 31
@@ -4013,12 +4013,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[2:3]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v12, v[0:1]
; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec
; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
; GFX9-G-O0-NEXT: s_mov_b32 s8, 31
@@ -4186,7 +4186,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
@@ -4195,28 +4195,28 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5
; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v21, v14, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v13, v6, v14
; GFX9-G-O0-NEXT: s_mov_b32 s6, 0
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6
+; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v14, v6
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6
-; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v13, v[21:22]
-; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16]
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27
+; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v6
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v14, v[4:5]
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[22:23], v14, v[15:16]
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v13, v[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v22
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v23
; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v25
; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23
-; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13
+; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v13, v22
; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0
-; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22]
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v21, v[4:5]
; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21
; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22
; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5]
@@ -4555,18 +4555,18 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v5
; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v1, s[6:7], v1, v0, s[6:7]
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v2, v0, s[6:7]
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v0, s[6:7]
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5
; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v3, v0, s[6:7]
-; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4
; GFX9-G-O0-NEXT: s_mov_b32 s5, 1
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5
; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
; GFX9-G-O0-NEXT: s_mov_b32 s5, 0
; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v5, v[2:3]
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index b56860991b1948..05828cf7271ddc 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -3254,31 +3254,31 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_cmpxchg:
@@ -3305,121 +3305,121 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3482,16 +3482,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -3500,16 +3500,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -3540,16 +3540,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -3557,16 +3557,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -3574,16 +3574,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -3591,16 +3591,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -3608,16 +3608,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -3626,16 +3626,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -3644,16 +3644,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -3661,16 +3661,16 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -3733,35 +3733,35 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_cmpxchg:
@@ -3789,141 +3789,141 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3988,18 +3988,18 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -4008,18 +4008,18 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -4051,17 +4051,17 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -4069,17 +4069,17 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -4087,18 +4087,18 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -4106,18 +4106,18 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4125,18 +4125,18 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -4145,18 +4145,18 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -4165,21 +4165,21 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -4187,21 +4187,21 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -4268,18 +4268,18 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -4288,18 +4288,18 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -4331,17 +4331,17 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -4349,17 +4349,17 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -4367,18 +4367,18 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -4386,18 +4386,18 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4405,18 +4405,18 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -4425,18 +4425,18 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -4445,21 +4445,21 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -4467,21 +4467,21 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -4546,16 +4546,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -4564,16 +4564,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX10-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -4604,16 +4604,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -4621,16 +4621,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -4638,16 +4638,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -4655,16 +4655,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4672,16 +4672,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -4690,16 +4690,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX11-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -4708,16 +4708,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -4725,16 +4725,16 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -4799,16 +4799,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -4817,16 +4817,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX10-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -4857,16 +4857,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -4874,16 +4874,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -4891,16 +4891,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -4908,16 +4908,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4925,16 +4925,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -4943,16 +4943,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX11-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -4961,16 +4961,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -4978,16 +4978,16 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -5054,18 +5054,18 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_agent_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -5074,18 +5074,18 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX10-CU-LABEL: global_agent_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -5117,17 +5117,17 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -5135,17 +5135,17 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -5153,18 +5153,18 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -5172,18 +5172,18 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -5191,18 +5191,18 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_agent_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -5211,18 +5211,18 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX11-CU-LABEL: global_agent_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -5231,21 +5231,21 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -5253,21 +5253,21 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -5334,18 +5334,18 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -5354,18 +5354,18 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX10-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -5397,17 +5397,17 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -5415,17 +5415,17 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -5433,18 +5433,18 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -5452,18 +5452,18 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -5471,18 +5471,18 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -5491,18 +5491,18 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX11-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -5511,21 +5511,21 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -5533,21 +5533,21 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -5614,18 +5614,18 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -5634,18 +5634,18 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX10-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -5677,17 +5677,17 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -5695,17 +5695,17 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -5713,18 +5713,18 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -5732,18 +5732,18 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -5751,18 +5751,18 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -5771,18 +5771,18 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX11-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -5791,21 +5791,21 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -5813,21 +5813,21 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -5894,18 +5894,18 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -5914,18 +5914,18 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -5957,17 +5957,17 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -5975,17 +5975,17 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -5993,18 +5993,18 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -6012,18 +6012,18 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -6031,18 +6031,18 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -6051,18 +6051,18 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -6071,21 +6071,21 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -6093,21 +6093,21 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -6174,18 +6174,18 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -6194,18 +6194,18 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -6237,17 +6237,17 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -6255,17 +6255,17 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -6273,18 +6273,18 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -6292,18 +6292,18 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -6311,18 +6311,18 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -6331,18 +6331,18 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -6351,21 +6351,21 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -6373,21 +6373,21 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -6454,18 +6454,18 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -6474,18 +6474,18 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -6517,17 +6517,17 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -6535,17 +6535,17 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -6553,18 +6553,18 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -6572,18 +6572,18 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -6591,18 +6591,18 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -6611,18 +6611,18 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -6631,21 +6631,21 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -6653,21 +6653,21 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -6734,18 +6734,18 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -6754,18 +6754,18 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -6797,17 +6797,17 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -6815,17 +6815,17 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -6833,18 +6833,18 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -6852,18 +6852,18 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -6871,18 +6871,18 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -6891,18 +6891,18 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -6911,21 +6911,21 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -6933,21 +6933,21 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -7014,18 +7014,18 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -7034,18 +7034,18 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -7077,17 +7077,17 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -7095,17 +7095,17 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -7113,18 +7113,18 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -7132,18 +7132,18 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -7151,18 +7151,18 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -7171,18 +7171,18 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -7191,21 +7191,21 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -7213,21 +7213,21 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -14874,31 +14874,31 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
@@ -14925,121 +14925,121 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15102,16 +15102,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -15120,16 +15120,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -15160,16 +15160,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -15177,16 +15177,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -15194,16 +15194,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -15211,16 +15211,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -15228,16 +15228,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -15246,16 +15246,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -15264,16 +15264,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -15281,16 +15281,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -15353,35 +15353,35 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
@@ -15409,141 +15409,141 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15608,18 +15608,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -15628,18 +15628,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -15671,17 +15671,17 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -15689,17 +15689,17 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
-; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
-; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -15707,18 +15707,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -15726,18 +15726,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -15745,18 +15745,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -15765,18 +15765,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -15785,21 +15785,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -15807,21 +15807,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -15888,18 +15888,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -15908,18 +15908,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -15951,17 +15951,17 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -15969,17 +15969,17 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -15987,18 +15987,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -16006,18 +16006,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16025,18 +16025,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -16045,18 +16045,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -16065,21 +16065,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -16087,21 +16087,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -16166,16 +16166,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -16184,16 +16184,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -16224,16 +16224,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -16241,16 +16241,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -16258,16 +16258,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -16275,16 +16275,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16292,16 +16292,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -16310,16 +16310,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -16328,16 +16328,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -16345,16 +16345,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -16419,16 +16419,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -16437,16 +16437,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -16477,16 +16477,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -16494,16 +16494,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -16511,16 +16511,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -16528,16 +16528,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16545,16 +16545,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -16563,16 +16563,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -16581,16 +16581,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -16598,16 +16598,16 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -16674,18 +16674,18 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -16694,18 +16694,18 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -16737,17 +16737,17 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -16755,17 +16755,17 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -16773,18 +16773,18 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -16792,18 +16792,18 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16811,18 +16811,18 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -16831,18 +16831,18 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -16851,21 +16851,21 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -16873,21 +16873,21 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -16954,18 +16954,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -16974,18 +16974,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -17017,17 +17017,17 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -17035,17 +17035,17 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -17053,18 +17053,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -17072,18 +17072,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -17091,18 +17091,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -17111,18 +17111,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -17131,21 +17131,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -17153,21 +17153,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -17234,18 +17234,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -17254,18 +17254,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -17297,17 +17297,17 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -17315,17 +17315,17 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -17333,18 +17333,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -17352,18 +17352,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -17371,18 +17371,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -17391,18 +17391,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -17411,21 +17411,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -17433,21 +17433,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -17514,18 +17514,18 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -17534,18 +17534,18 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -17577,17 +17577,17 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -17595,17 +17595,17 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -17613,18 +17613,18 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -17632,18 +17632,18 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -17651,18 +17651,18 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -17671,18 +17671,18 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -17691,21 +17691,21 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -17713,21 +17713,21 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -17794,18 +17794,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -17814,18 +17814,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -17857,17 +17857,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -17875,17 +17875,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -17893,18 +17893,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -17912,18 +17912,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -17931,18 +17931,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -17951,18 +17951,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -17971,21 +17971,21 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -17993,21 +17993,21 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -18074,18 +18074,18 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -18094,18 +18094,18 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -18137,17 +18137,17 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -18155,17 +18155,17 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -18173,18 +18173,18 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -18192,18 +18192,18 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -18211,18 +18211,18 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -18231,18 +18231,18 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -18251,21 +18251,21 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -18273,21 +18273,21 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -18354,18 +18354,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -18374,18 +18374,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -18417,17 +18417,17 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -18435,17 +18435,17 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -18453,18 +18453,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -18472,18 +18472,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -18491,18 +18491,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -18511,18 +18511,18 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -18531,21 +18531,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -18553,21 +18553,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
@@ -18634,18 +18634,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -18654,18 +18654,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -18697,17 +18697,17 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
@@ -18715,17 +18715,17 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -18733,18 +18733,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -18752,18 +18752,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -18771,18 +18771,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -18791,18 +18791,18 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -18811,21 +18811,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_endpgm
@@ -18833,21 +18833,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 9b2b3a4cfa9bae..9667685a8fec1f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -685,15 +685,15 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX6-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11]
; GFX6-NEXT: s_mov_b32 s9, 2
-; GFX6-NEXT: v_lshlrev_b32_e64 v1, s9, v0
+; GFX6-NEXT: v_lshlrev_b32_e64 v0, s9, v0
; GFX6-NEXT: s_mov_b32 s9, 0
; GFX6-NEXT: ; implicit-def: $sgpr9
-; GFX6-NEXT: v_mov_b32_e32 v0, 0
-; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 glc slc
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_nontemporal_store_1:
@@ -767,15 +767,15 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[6:7]
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2
-; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0
+; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s5, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0
; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr5
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
-; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 0
+; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 glc slc
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 glc slc
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_1:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index afc46fbc23a67a..37cf3d8035d4f4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -2863,31 +2863,31 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
@@ -2914,121 +2914,121 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3087,31 +3087,31 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
@@ -3138,121 +3138,121 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3311,31 +3311,31 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_cmpxchg:
@@ -3362,121 +3362,121 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3535,31 +3535,31 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
@@ -3586,121 +3586,121 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3759,31 +3759,31 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
@@ -3810,121 +3810,121 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3983,31 +3983,31 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
@@ -4034,121 +4034,121 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4207,31 +4207,31 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_cmpxchg:
@@ -4258,121 +4258,121 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4431,31 +4431,31 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_cmpxchg:
@@ -4482,121 +4482,121 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4655,31 +4655,31 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
@@ -4706,121 +4706,121 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4879,31 +4879,31 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
@@ -4930,121 +4930,121 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5103,31 +5103,31 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
@@ -5154,121 +5154,121 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5327,31 +5327,31 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
@@ -5378,121 +5378,121 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5551,31 +5551,31 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_cmpxchg:
@@ -5602,121 +5602,121 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5775,31 +5775,31 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
@@ -5826,121 +5826,121 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5999,31 +5999,31 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
@@ -6050,121 +6050,121 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -12848,31 +12848,31 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
@@ -12899,121 +12899,121 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13072,31 +13072,31 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
@@ -13123,121 +13123,121 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13296,31 +13296,31 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
@@ -13347,121 +13347,121 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13520,31 +13520,31 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
@@ -13571,121 +13571,121 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
-; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
-; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13744,31 +13744,31 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
@@ -13795,121 +13795,121 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13968,31 +13968,31 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
@@ -14019,121 +14019,121 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14192,31 +14192,31 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
@@ -14243,121 +14243,121 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14416,31 +14416,31 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
@@ -14467,121 +14467,121 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14640,31 +14640,31 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
@@ -14691,121 +14691,121 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14864,31 +14864,31 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
@@ -14915,121 +14915,121 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15088,31 +15088,31 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
@@ -15139,121 +15139,121 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15312,31 +15312,31 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
@@ -15363,121 +15363,121 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15536,31 +15536,31 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
@@ -15587,121 +15587,121 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15760,31 +15760,31 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
@@ -15811,121 +15811,121 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15984,31 +15984,31 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
@@ -16035,121 +16035,121 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 62a4f3b43b2dcd..2a43d063e19536 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -3284,31 +3284,31 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_cmpxchg:
@@ -3335,121 +3335,121 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3512,16 +3512,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -3530,16 +3530,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -3570,16 +3570,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -3588,16 +3588,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -3606,16 +3606,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -3623,16 +3623,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -3640,16 +3640,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -3658,16 +3658,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -3676,16 +3676,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -3693,16 +3693,16 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -3765,35 +3765,35 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_release_monotonic_cmpxchg:
@@ -3821,143 +3821,143 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4022,18 +4022,18 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -4042,18 +4042,18 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -4085,18 +4085,18 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -4105,18 +4105,18 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -4125,18 +4125,18 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -4144,18 +4144,18 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4163,18 +4163,18 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -4183,18 +4183,18 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -4203,21 +4203,21 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -4225,21 +4225,21 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -4306,18 +4306,18 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -4326,18 +4326,18 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -4369,18 +4369,18 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -4389,18 +4389,18 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -4409,18 +4409,18 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -4428,18 +4428,18 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4447,18 +4447,18 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -4467,18 +4467,18 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -4487,21 +4487,21 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -4509,21 +4509,21 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -4588,16 +4588,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -4606,16 +4606,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX10-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -4646,16 +4646,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -4664,16 +4664,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -4682,16 +4682,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -4699,16 +4699,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4716,16 +4716,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -4734,16 +4734,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX11-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -4752,16 +4752,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -4769,16 +4769,16 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -4843,16 +4843,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -4861,16 +4861,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -4901,16 +4901,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -4919,16 +4919,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -4937,16 +4937,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -4954,16 +4954,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4971,16 +4971,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -4989,16 +4989,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX11-CU-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -5007,16 +5007,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -5024,16 +5024,16 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -5100,18 +5100,18 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -5120,18 +5120,18 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -5163,18 +5163,18 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -5183,18 +5183,18 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -5203,18 +5203,18 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -5222,18 +5222,18 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -5241,18 +5241,18 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_system_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -5261,18 +5261,18 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX11-CU-LABEL: global_system_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -5281,21 +5281,21 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -5303,21 +5303,21 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -5384,18 +5384,18 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -5404,18 +5404,18 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -5447,18 +5447,18 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -5467,18 +5467,18 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -5487,18 +5487,18 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -5506,18 +5506,18 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -5525,18 +5525,18 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -5545,18 +5545,18 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX11-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -5565,21 +5565,21 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -5587,21 +5587,21 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -5668,18 +5668,18 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -5688,18 +5688,18 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -5731,18 +5731,18 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -5751,18 +5751,18 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -5771,18 +5771,18 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -5790,18 +5790,18 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -5809,18 +5809,18 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -5829,18 +5829,18 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX11-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -5849,21 +5849,21 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -5871,21 +5871,21 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -5952,18 +5952,18 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -5972,18 +5972,18 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -6015,18 +6015,18 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -6035,18 +6035,18 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -6055,18 +6055,18 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -6074,18 +6074,18 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -6093,18 +6093,18 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -6113,18 +6113,18 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -6133,21 +6133,21 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -6155,21 +6155,21 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -13610,31 +13610,31 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
@@ -13661,121 +13661,121 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13838,16 +13838,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -13856,16 +13856,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -13896,16 +13896,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -13914,16 +13914,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -13932,16 +13932,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -13949,16 +13949,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -13966,16 +13966,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -13984,16 +13984,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -14002,16 +14002,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -14019,16 +14019,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -14091,35 +14091,35 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg:
@@ -14147,143 +14147,143 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14348,18 +14348,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -14368,18 +14368,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -14411,18 +14411,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -14431,18 +14431,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
-; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
-; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -14451,18 +14451,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -14470,18 +14470,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -14489,18 +14489,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -14509,18 +14509,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -14529,21 +14529,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -14551,21 +14551,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -14632,18 +14632,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -14652,18 +14652,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -14695,18 +14695,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -14715,18 +14715,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -14735,18 +14735,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -14754,18 +14754,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -14773,18 +14773,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -14793,18 +14793,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -14813,21 +14813,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -14835,21 +14835,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -14914,16 +14914,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -14932,16 +14932,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -14972,16 +14972,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -14990,16 +14990,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -15008,16 +15008,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -15025,16 +15025,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -15042,16 +15042,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -15060,16 +15060,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -15078,16 +15078,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -15095,16 +15095,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -15169,16 +15169,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -15187,16 +15187,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -15227,16 +15227,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -15245,16 +15245,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -15263,16 +15263,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -15280,16 +15280,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -15297,16 +15297,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -15315,16 +15315,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -15333,16 +15333,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -15350,16 +15350,16 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -15426,18 +15426,18 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -15446,18 +15446,18 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -15489,18 +15489,18 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -15509,18 +15509,18 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -15529,18 +15529,18 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -15548,18 +15548,18 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -15567,18 +15567,18 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -15587,18 +15587,18 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -15607,21 +15607,21 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -15629,21 +15629,21 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -15710,18 +15710,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -15730,18 +15730,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -15773,18 +15773,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -15793,18 +15793,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -15813,18 +15813,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -15832,18 +15832,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -15851,18 +15851,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -15871,18 +15871,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -15891,21 +15891,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -15913,21 +15913,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -15994,18 +15994,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -16014,18 +16014,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -16057,18 +16057,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -16077,18 +16077,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -16097,18 +16097,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -16116,18 +16116,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16135,18 +16135,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -16155,18 +16155,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -16175,21 +16175,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -16197,21 +16197,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -16278,18 +16278,18 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -16298,18 +16298,18 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -16341,18 +16341,18 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -16361,18 +16361,18 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -16381,18 +16381,18 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -16400,18 +16400,18 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16419,18 +16419,18 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -16439,18 +16439,18 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -16459,21 +16459,21 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -16481,21 +16481,21 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -16562,18 +16562,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -16582,18 +16582,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -16625,18 +16625,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -16645,18 +16645,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -16665,18 +16665,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -16684,18 +16684,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16703,18 +16703,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -16723,18 +16723,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -16743,21 +16743,21 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -16765,21 +16765,21 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -16846,18 +16846,18 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -16866,18 +16866,18 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -16909,18 +16909,18 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -16929,18 +16929,18 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -16949,18 +16949,18 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -16968,18 +16968,18 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16987,18 +16987,18 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -17007,18 +17007,18 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -17027,21 +17027,21 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -17049,21 +17049,21 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -17130,18 +17130,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -17150,18 +17150,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -17193,18 +17193,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -17213,18 +17213,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -17233,18 +17233,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -17252,18 +17252,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -17271,18 +17271,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -17291,18 +17291,18 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -17311,21 +17311,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -17333,21 +17333,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
@@ -17414,18 +17414,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
@@ -17434,18 +17434,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
@@ -17477,18 +17477,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -17497,18 +17497,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
@@ -17517,18 +17517,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
@@ -17536,18 +17536,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -17555,18 +17555,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
@@ -17575,18 +17575,18 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
@@ -17595,21 +17595,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
@@ -17617,21 +17617,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index a98efb49b4b72b..438535d7667523 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -530,15 +530,15 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX6-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_mov_b32 s5, 2
-; GFX6-NEXT: v_lshlrev_b32_e64 v1, s5, v0
+; GFX6-NEXT: v_lshlrev_b32_e64 v0, s5, v0
; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: ; implicit-def: $sgpr5
-; GFX6-NEXT: v_mov_b32_e32 v0, 0
-; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_endpgm
;
@@ -616,15 +616,15 @@ define amdgpu_kernel void @global_volatile_store_1(
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[6:7]
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2
-; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0
+; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s5, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0
; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr5
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
-; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 0
+; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index f805e2cf37006c..e5787150741f48 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -2863,31 +2863,31 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
@@ -2914,121 +2914,121 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3087,31 +3087,31 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
@@ -3138,121 +3138,121 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3311,31 +3311,31 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_cmpxchg:
@@ -3362,121 +3362,121 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3535,31 +3535,31 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
@@ -3586,121 +3586,121 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3759,31 +3759,31 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
@@ -3810,121 +3810,121 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3983,31 +3983,31 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
@@ -4034,121 +4034,121 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4207,31 +4207,31 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_cmpxchg:
@@ -4258,121 +4258,121 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4431,31 +4431,31 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_cmpxchg:
@@ -4482,121 +4482,121 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4655,31 +4655,31 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
@@ -4706,121 +4706,121 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4879,31 +4879,31 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
@@ -4930,121 +4930,121 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5103,31 +5103,31 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
@@ -5154,121 +5154,121 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5327,31 +5327,31 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
@@ -5378,121 +5378,121 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5551,31 +5551,31 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_cmpxchg:
@@ -5602,121 +5602,121 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5775,31 +5775,31 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
@@ -5826,121 +5826,121 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5999,31 +5999,31 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
@@ -6050,121 +6050,121 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -12848,31 +12848,31 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
@@ -12899,121 +12899,121 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13072,31 +13072,31 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
@@ -13123,121 +13123,121 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13296,31 +13296,31 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
@@ -13347,121 +13347,121 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13520,31 +13520,31 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
@@ -13571,121 +13571,121 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
-; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
-; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13744,31 +13744,31 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
@@ -13795,121 +13795,121 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13968,31 +13968,31 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
@@ -14019,121 +14019,121 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14192,31 +14192,31 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
@@ -14243,121 +14243,121 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14416,31 +14416,31 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
@@ -14467,121 +14467,121 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14640,31 +14640,31 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
@@ -14691,121 +14691,121 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14864,31 +14864,31 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
@@ -14915,121 +14915,121 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15088,31 +15088,31 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
@@ -15139,121 +15139,121 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15312,31 +15312,31 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
@@ -15363,121 +15363,121 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15536,31 +15536,31 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
@@ -15587,121 +15587,121 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15760,31 +15760,31 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
@@ -15811,121 +15811,121 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15984,31 +15984,31 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
@@ -16035,121 +16035,121 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 30bf4920715352..e45a5d767f82d2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -3071,31 +3071,31 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
@@ -3122,121 +3122,121 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3295,16 +3295,16 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -3312,16 +3312,16 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
@@ -3348,31 +3348,31 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -3380,31 +3380,31 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -3412,16 +3412,16 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -3429,31 +3429,31 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -3461,16 +3461,16 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3531,34 +3531,34 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_cmpxchg:
@@ -3586,134 +3586,134 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3774,18 +3774,18 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -3793,17 +3793,17 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
@@ -3831,33 +3831,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -3865,33 +3865,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -3899,18 +3899,18 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -3918,37 +3918,37 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -3956,17 +3956,17 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4027,18 +4027,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -4046,17 +4046,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
@@ -4084,33 +4084,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -4118,33 +4118,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4152,18 +4152,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -4171,37 +4171,37 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -4209,17 +4209,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4278,16 +4278,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -4295,16 +4295,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
@@ -4331,31 +4331,31 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -4363,31 +4363,31 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4395,16 +4395,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -4412,31 +4412,31 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -4444,16 +4444,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4512,16 +4512,16 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -4529,16 +4529,16 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_cmpxchg:
@@ -4565,31 +4565,31 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -4597,31 +4597,31 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4629,16 +4629,16 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -4646,31 +4646,31 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -4678,16 +4678,16 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4748,18 +4748,18 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -4767,17 +4767,17 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg:
@@ -4805,33 +4805,33 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -4839,33 +4839,33 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -4873,18 +4873,18 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -4892,37 +4892,37 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -4930,17 +4930,17 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5001,18 +5001,18 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -5020,17 +5020,17 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
@@ -5058,33 +5058,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -5092,33 +5092,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -5126,18 +5126,18 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -5145,37 +5145,37 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -5183,17 +5183,17 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5254,18 +5254,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -5273,17 +5273,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
@@ -5311,33 +5311,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -5345,33 +5345,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -5379,18 +5379,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -5398,37 +5398,37 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -5436,17 +5436,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5507,18 +5507,18 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -5526,17 +5526,17 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
@@ -5564,33 +5564,33 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -5598,33 +5598,33 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -5632,18 +5632,18 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -5651,37 +5651,37 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -5689,17 +5689,17 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -5760,18 +5760,18 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -5779,17 +5779,17 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
@@ -5817,33 +5817,33 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -5851,33 +5851,33 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -5885,18 +5885,18 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -5904,37 +5904,37 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -5942,17 +5942,17 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -6013,18 +6013,18 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -6032,17 +6032,17 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg:
@@ -6070,33 +6070,33 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -6104,33 +6104,33 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -6138,18 +6138,18 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -6157,37 +6157,37 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -6195,17 +6195,17 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -6266,18 +6266,18 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -6285,17 +6285,17 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
@@ -6323,33 +6323,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -6357,33 +6357,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -6391,18 +6391,18 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -6410,37 +6410,37 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -6448,17 +6448,17 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -6519,18 +6519,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -6538,17 +6538,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
@@ -6576,33 +6576,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -6610,33 +6610,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -6644,18 +6644,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -6663,37 +6663,37 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -6701,17 +6701,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
; GFX12-CU-NEXT: s_wait_dscnt 0x0
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -13843,31 +13843,31 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
@@ -13894,121 +13894,121 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14067,16 +14067,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -14084,16 +14084,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
@@ -14120,31 +14120,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -14152,31 +14152,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -14184,16 +14184,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -14201,31 +14201,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -14233,16 +14233,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14301,33 +14301,33 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
@@ -14354,130 +14354,130 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14536,18 +14536,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -14555,16 +14555,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
@@ -14591,32 +14591,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
-; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
-; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -14624,32 +14624,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -14657,18 +14657,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -14676,36 +14676,36 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -14713,16 +14713,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -14781,18 +14781,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -14800,16 +14800,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
@@ -14836,32 +14836,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -14869,32 +14869,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -14902,18 +14902,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -14921,36 +14921,36 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -14958,16 +14958,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15026,16 +15026,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -15043,16 +15043,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
@@ -15079,31 +15079,31 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -15111,31 +15111,31 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -15143,16 +15143,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -15160,31 +15160,31 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -15192,16 +15192,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15260,16 +15260,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -15277,16 +15277,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
@@ -15313,31 +15313,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -15345,31 +15345,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -15377,16 +15377,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -15394,31 +15394,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -15426,16 +15426,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15494,18 +15494,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -15513,16 +15513,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
@@ -15549,32 +15549,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -15582,32 +15582,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -15615,18 +15615,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -15634,36 +15634,36 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -15671,16 +15671,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15739,18 +15739,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -15758,16 +15758,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
@@ -15794,32 +15794,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -15827,32 +15827,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -15860,18 +15860,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -15879,36 +15879,36 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -15916,16 +15916,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -15984,18 +15984,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -16003,16 +16003,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
@@ -16039,32 +16039,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -16072,32 +16072,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16105,18 +16105,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -16124,36 +16124,36 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -16161,16 +16161,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -16229,18 +16229,18 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -16248,16 +16248,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
@@ -16284,32 +16284,32 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -16317,32 +16317,32 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16350,18 +16350,18 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -16369,36 +16369,36 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -16406,16 +16406,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -16474,18 +16474,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -16493,16 +16493,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
@@ -16529,32 +16529,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -16562,32 +16562,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16595,18 +16595,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -16614,36 +16614,36 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -16651,16 +16651,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -16719,18 +16719,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -16738,16 +16738,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
@@ -16774,32 +16774,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -16807,32 +16807,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -16840,18 +16840,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -16859,36 +16859,36 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -16896,16 +16896,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -16964,18 +16964,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -16983,16 +16983,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
@@ -17019,32 +17019,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -17052,32 +17052,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -17085,18 +17085,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -17104,36 +17104,36 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -17141,16 +17141,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -17209,18 +17209,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
@@ -17228,16 +17228,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
-; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
@@ -17264,32 +17264,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s7
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
@@ -17297,32 +17297,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
-; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s3
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s2
+; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
@@ -17330,18 +17330,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
@@ -17349,36 +17349,36 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
-; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
@@ -17386,16 +17386,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
-; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll b/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
index d1469ed6c67432..3598d43c6c5385 100644
--- a/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
@@ -8,16 +8,16 @@ define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 {
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
-; CHECK-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
+; CHECK-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:16
-; CHECK-NEXT: ; kill: def $vgpr6_vgpr7_vgpr8_vgpr9 killed $vgpr6_vgpr7_vgpr8_vgpr9 def $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13 killed $exec
+; CHECK-NEXT: ; kill: def $vgpr10_vgpr11_vgpr12_vgpr13 killed $vgpr10_vgpr11_vgpr12_vgpr13 def $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 killed $exec
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v13, v3
-; CHECK-NEXT: v_mov_b32_e32 v12, v2
-; CHECK-NEXT: v_mov_b32_e32 v11, v1
-; CHECK-NEXT: v_mov_b32_e32 v10, v0
+; CHECK-NEXT: v_mov_b32_e32 v17, v3
+; CHECK-NEXT: v_mov_b32_e32 v16, v2
+; CHECK-NEXT: v_mov_b32_e32 v15, v1
+; CHECK-NEXT: v_mov_b32_e32 v14, v0
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: global_load_dwordx4 v[18:21], v[0:1], off
@@ -30,14 +30,14 @@ define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 {
; CHECK-NEXT: v_mov_b32_e32 v24, v2
; CHECK-NEXT: v_mov_b32_e32 v23, v1
; CHECK-NEXT: v_mov_b32_e32 v22, v0
-; CHECK-NEXT: v_mov_b32_e32 v4, v6
-; CHECK-NEXT: v_mov_b32_e32 v5, v7
-; CHECK-NEXT: v_mov_b32_e32 v2, v8
-; CHECK-NEXT: v_mov_b32_e32 v3, v9
-; CHECK-NEXT: v_mov_b32_e32 v0, v10
-; CHECK-NEXT: v_mov_b32_e32 v1, v11
-; CHECK-NEXT: v_mov_b32_e32 v8, v12
-; CHECK-NEXT: v_mov_b32_e32 v9, v13
+; CHECK-NEXT: v_mov_b32_e32 v2, v10
+; CHECK-NEXT: v_mov_b32_e32 v3, v11
+; CHECK-NEXT: v_mov_b32_e32 v0, v12
+; CHECK-NEXT: v_mov_b32_e32 v1, v13
+; CHECK-NEXT: v_mov_b32_e32 v8, v14
+; CHECK-NEXT: v_mov_b32_e32 v9, v15
+; CHECK-NEXT: v_mov_b32_e32 v6, v16
+; CHECK-NEXT: v_mov_b32_e32 v7, v17
; CHECK-NEXT: v_mov_b32_e32 v16, v18
; CHECK-NEXT: v_mov_b32_e32 v17, v19
; CHECK-NEXT: v_mov_b32_e32 v14, v20
@@ -46,51 +46,51 @@ define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 {
; CHECK-NEXT: v_mov_b32_e32 v13, v23
; CHECK-NEXT: v_mov_b32_e32 v10, v24
; CHECK-NEXT: v_mov_b32_e32 v11, v25
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
-; CHECK-NEXT: v_mov_b32_e32 v4, v5
-; CHECK-NEXT: v_mov_b32_e32 v7, v16
-; CHECK-NEXT: v_mov_b32_e32 v5, v17
-; CHECK-NEXT: v_add_co_u32 v6, s6, v6, v7
-; CHECK-NEXT: v_add_co_ci_u32_e64 v4, s6, v4, v5, s6
-; CHECK-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; CHECK-NEXT: v_mov_b32_e32 v7, v4
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v3
-; CHECK-NEXT: v_mov_b32_e32 v5, v14
-; CHECK-NEXT: v_mov_b32_e32 v3, v15
+; CHECK-NEXT: v_mov_b32_e32 v5, v16
+; CHECK-NEXT: v_mov_b32_e32 v3, v17
; CHECK-NEXT: v_add_co_u32 v4, s6, v4, v5
; CHECK-NEXT: v_add_co_ci_u32_e64 v2, s6, v2, v3, s6
; CHECK-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v5, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v0
; CHECK-NEXT: v_mov_b32_e32 v0, v1
-; CHECK-NEXT: v_mov_b32_e32 v3, v12
-; CHECK-NEXT: v_mov_b32_e32 v1, v13
+; CHECK-NEXT: v_mov_b32_e32 v3, v14
+; CHECK-NEXT: v_mov_b32_e32 v1, v15
; CHECK-NEXT: v_add_co_u32 v2, s6, v2, v3
; CHECK-NEXT: v_add_co_ci_u32_e64 v0, s6, v0, v1, s6
; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_mov_b32_e32 v0, v8
; CHECK-NEXT: v_mov_b32_e32 v1, v9
-; CHECK-NEXT: v_mov_b32_e32 v9, v10
-; CHECK-NEXT: v_mov_b32_e32 v8, v11
+; CHECK-NEXT: v_mov_b32_e32 v9, v12
+; CHECK-NEXT: v_mov_b32_e32 v8, v13
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, v9
; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s6, v1, v8, s6
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v1, v8
-; CHECK-NEXT: ; kill: def $vgpr6_vgpr7 killed $vgpr6_vgpr7 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
-; CHECK-NEXT: v_mov_b32_e32 v9, v5
-; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: ; kill: def $vgpr2_vgpr3 killed $vgpr2_vgpr3 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
-; CHECK-NEXT: v_mov_b32_e32 v5, v1
-; CHECK-NEXT: v_mov_b32_e32 v4, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s4
-; CHECK-NEXT: v_mov_b32_e32 v1, s5
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT: v_mov_b32_e32 v8, v6
+; CHECK-NEXT: v_mov_b32_e32 v6, v7
+; CHECK-NEXT: v_mov_b32_e32 v9, v10
+; CHECK-NEXT: v_mov_b32_e32 v7, v11
+; CHECK-NEXT: v_add_co_u32 v8, s6, v8, v9
+; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s6, v6, v7, s6
+; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v9, v6
+; CHECK-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v7, v3
+; CHECK-NEXT: v_mov_b32_e32 v6, v2
+; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v2, v8
+; CHECK-NEXT: v_mov_b32_e32 v3, v9
+; CHECK-NEXT: v_mov_b32_e32 v9, s5
+; CHECK-NEXT: v_mov_b32_e32 v8, s4
+; CHECK-NEXT: global_store_dwordx4 v[8:9], v[4:7], off
; CHECK-NEXT: s_mov_b64 s[4:5], 16
-; CHECK-NEXT: v_mov_b32_e32 v0, s4
-; CHECK-NEXT: v_mov_b32_e32 v1, s5
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: v_mov_b32_e32 v4, s4
+; CHECK-NEXT: v_mov_b32_e32 v5, s5
+; CHECK-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; CHECK-NEXT: s_endpgm
entry:
%load0 = load <4 x i64>, ptr addrspace(1) null, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index fe093d4ac8515e..15cfe6481f0b43 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -953,45 +953,45 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v12, v[19:20]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7
; GFX9-O0-NEXT: s_mov_b32 s6, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20]
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24
-; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12
+; GFX9-O0-NEXT: v_sub_u32_e64 v21, s6, v12
+; GFX9-O0-NEXT: v_lshlrev_b64 v[21:22], v21, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22
+; GFX9-O0-NEXT: v_or_b32_e64 v15, v15, v23
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6
-; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6
-; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v12, s6
+; GFX9-O0-NEXT: v_sub_u32_e64 v15, v12, s6
+; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], v15, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v22
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v15, v15, v23, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s6, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7]
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v12, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v15, v15, v23, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v21
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v12, v[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s7
@@ -1074,8 +1074,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -1085,21 +1085,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: s_mov_b32 s4, s7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s6
; GFX9-O0-NEXT: s_mov_b32 s9, s7
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5
+; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v8, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v6, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
@@ -1107,7 +1107,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
@@ -1119,12 +1119,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
-; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12]
+; GFX9-O0-NEXT: v_sub_u32_e64 v7, s4, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v7, v[11:12]
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
; GFX9-O0-NEXT: s_mov_b32 s4, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3
-; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8]
+; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[3:4]
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15
; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
@@ -1133,16 +1133,16 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v7, s4
; GFX9-O0-NEXT: s_mov_b32 s10, 63
-; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5]
+; GFX9-O0-NEXT: v_sub_u32_e64 v8, s10, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v8, v[3:4]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s10, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v7, s10
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[10:11]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
@@ -1151,8 +1151,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v7, v[3:4]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5]
@@ -2343,45 +2343,45 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[21:22]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v12, v[19:20]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7
; GFX9-O0-NEXT: s_mov_b32 s6, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v12, s6, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], v12, v[19:20]
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v24
-; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12
+; GFX9-O0-NEXT: v_sub_u32_e64 v21, s6, v12
+; GFX9-O0-NEXT: v_lshlrev_b64 v[21:22], v21, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v22
+; GFX9-O0-NEXT: v_or_b32_e64 v15, v15, v23
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v4, s6
-; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s6
-; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], v5, v[19:20]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v12, s6
+; GFX9-O0-NEXT: v_sub_u32_e64 v15, v12, s6
+; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], v15, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v22
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v15, v15, v23, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s6, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v22
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7]
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v12, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v20
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v15, v15, v23, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v21
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[19:20]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v12, v[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s7
@@ -2464,8 +2464,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -2475,21 +2475,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: s_mov_b32 s4, s7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s6
; GFX9-O0-NEXT: s_mov_b32 s9, s7
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v4, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s8
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5
+; GFX9-O0-NEXT: v_add_co_u32_e32 v9, vcc, v8, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v6, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
@@ -2497,7 +2497,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
@@ -2509,12 +2509,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
-; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12]
+; GFX9-O0-NEXT: v_sub_u32_e64 v7, s4, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v7, v[11:12]
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
; GFX9-O0-NEXT: s_mov_b32 s4, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v3
-; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[7:8]
+; GFX9-O0-NEXT: v_sub_u32_e64 v14, s4, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v14, v[3:4]
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15
; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v16
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
@@ -2523,16 +2523,16 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, s4
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v7, s4
; GFX9-O0-NEXT: s_mov_b32 s10, 63
-; GFX9-O0-NEXT: v_sub_u32_e64 v4, s10, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v4, v[7:8]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[4:5]
+; GFX9-O0-NEXT: v_sub_u32_e64 v8, s10, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v8, v[3:4]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s10, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, s10
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v7, s10
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[10:11]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v15, s[10:11]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
@@ -2541,8 +2541,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v3, v[7:8]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[7:8], v7, v[3:4]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s9
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index f523b4a2495f19..3fc3d1eff49a11 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -1301,35 +1301,32 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
; GCN-NEXT: v_writelane_b32 v5, s35, 1
; GCN-NEXT: v_writelane_b32 v5, s36, 2
; GCN-NEXT: v_writelane_b32 v5, s37, 3
-; GCN-NEXT: v_mov_b32_e32 v4, v3
-; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v4, v1
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GCN-NEXT: ; implicit-def: $sgpr4
; GCN-NEXT: ; implicit-def: $sgpr4
-; GCN-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GCN-NEXT: v_mov_b32_e32 v2, v4
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; GCN-NEXT: ; implicit-def: $sgpr4
; GCN-NEXT: ; implicit-def: $sgpr4
-; GCN-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GCN-NEXT: v_mov_b32_e32 v3, v4
+; GCN-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GCN-NEXT: flat_load_dwordx4 v[6:9], v[2:3]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
+; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GCN-NEXT: v_readlane_b32 s37, v5, 3
; GCN-NEXT: v_readlane_b32 s36, v5, 2
; GCN-NEXT: v_readlane_b32 s35, v5, 1
More information about the llvm-commits
mailing list