[llvm] [AMDGPU] Analyze REG_SEQUENCE To Remove Redundant CMP Instructions (PR #167364)
Patrick Simmons via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 10 10:32:27 PST 2025
https://github.com/linuxrocks123 created https://github.com/llvm/llvm-project/pull/167364
This PR adds analysis of REG_SEQUENCE instructions to the AMDGPU TargetInstrInfo (SIInstrInfo) so that the peephole optimizer can remove redundant CMP instructions even when a REG_SEQUENCE pseudo sits between the def and the use of SCC.
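For illustration, the targeted pattern looks roughly like the following (a simplified, hand-written MIR sketch; the register names and opcodes are illustrative and not taken from the patch's tests):

  %lo:sreg_32 = S_LSHR_B32 %src, 2, implicit-def $scc    ; SCC = (%lo != 0)
  %hi:sreg_32 = S_MOV_B32 0                               ; known-zero half
  %pair:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
  S_CMP_LG_U64 %pair, 0, implicit-def $scc                ; redundant compare

Because the sub1 half of %pair is a known zero, comparing the 64-bit value against zero is equivalent to checking %lo alone, and the S_LSHR_B32 that defines %lo already set SCC accordingly. Once optimizeCompareInstr pierces through the REG_SEQUENCE to that real defining instruction, the S_CMP_LG_U64 can be deleted.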
From ff33b9879cef92983addfbee039e507b00fc080d Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Mon, 10 Nov 2025 12:50:36 -0500
Subject: [PATCH 1/5] revised initial commit
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 24 ++++++++++++++++++++++++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 ++
2 files changed, 26 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..80e6accd1196f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1312,6 +1312,27 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
return Reg;
}
+MachineInstr *
+SIInstrInfo::pierceThroughRegSequence(const MachineInstr &MI) const {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ int64_t SubRegValues[2];
+ bool SubRegIsConst[2];
+ MachineInstr *RealDefs[2];
+ for (unsigned I : {2, 4}) {
+ unsigned ArrayIdx = MI.getOperand(I).getImm() == AMDGPU::sub0 ? 0 : 1;
+ Register Subreg = MI.getOperand(I - 1).getReg();
+ RealDefs[ArrayIdx] = MRI.getUniqueVRegDef(Subreg);
+ SubRegIsConst[ArrayIdx] = getConstValDefinedInReg(
+ *RealDefs[ArrayIdx], Subreg, SubRegValues[ArrayIdx]);
+ }
+
+ for (unsigned I : {0, 1})
+ if (SubRegIsConst[I] && !SubRegValues[I])
+ return RealDefs[(I + 1) % 2];
+
+ return nullptr;
+}
+
bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
const Register Reg,
int64_t &ImmVal) const {
@@ -10676,6 +10697,9 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!Def || Def->getParent() != CmpInstr.getParent())
return false;
+ if (MachineInstr *RegSequenceDef = pierceThroughRegSequence(*Def))
+ Def = RegSequenceDef;
+
// For S_OP that set SCC = DST!=0, do the transformation
//
// s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0643b532ea04c..d7d049f722b47 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -714,6 +714,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
}
}
+ MachineInstr *pierceThroughRegSequence(const MachineInstr &MI) const;
+
static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case AMDGPU::S_ABSDIFF_I32:
From b106332650020892edf18d56dc62ede3b65d35c2 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Mon, 10 Nov 2025 13:04:15 -0500
Subject: [PATCH 2/5] Fix
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 80e6accd1196f..951df6899f5a1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1314,6 +1314,9 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
MachineInstr *
SIInstrInfo::pierceThroughRegSequence(const MachineInstr &MI) const {
+ if (MI.getOpcode() != AMDGPU::REG_SEQUENCE)
+ return nullptr;
+
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
int64_t SubRegValues[2];
bool SubRegIsConst[2];
From e4f7c8ec86b74afda843e4d9a111279000431a95 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Mon, 10 Nov 2025 13:06:06 -0500
Subject: [PATCH 3/5] Add testcase passing on main
---
.../AMDGPU/redundant-cmp-reg-sequence.ll | 23 +++++++++++++++++++
1 file changed, 23 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll
diff --git a/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll b/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll
new file mode 100644
index 0000000000000..00e479fe2eccc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+define amdgpu_ps i64 @ordertest(i64 inreg %val0) {
+; CHECK-LABEL: ordertest:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshr_b32 s0, s1, 2
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], v2, s[0:1]
+; CHECK-NEXT: v_xor_b32_e32 v0, v2, v0
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %shl = lshr i64 %val0, 34
+ %result = and i64 %shl, 4294967295
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i64
+ %param0 = lshr i64 %shl, %zext
+ %param = and i64 %param0, 4294967295
+ %xory = xor i64 %zext, %param
+ ret i64 %xory
+}
From 89ee19eee9c06c1d366ecaf67e8d2308b159c853 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Mon, 10 Nov 2025 13:06:34 -0500
Subject: [PATCH 4/5] Add testcase passing on branch
---
llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll b/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll
index 00e479fe2eccc..750a9027d47a4 100644
--- a/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll
+++ b/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll
@@ -4,9 +4,8 @@ define amdgpu_ps i64 @ordertest(i64 inreg %val0) {
; CHECK-LABEL: ordertest:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b32 s0, s1, 2
-; CHECK-NEXT: s_mov_b32 s1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; CHECK-NEXT: v_lshrrev_b64 v[0:1], v2, s[0:1]
; CHECK-NEXT: v_xor_b32_e32 v0, v2, v0
From b28755abe27fbb9b0a463ece0dc2dbeebbaa50e8 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Mon, 10 Nov 2025 13:29:14 -0500
Subject: [PATCH 5/5] Update testcases
---
.../test/CodeGen/AMDGPU/carryout-selection.ll | 346 +++++++++---------
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 4 -
llvm/test/CodeGen/AMDGPU/srem.ll | 98 ++---
llvm/test/CodeGen/AMDGPU/wave32.ll | 115 +++---
4 files changed, 263 insertions(+), 300 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b96de173dc8c6..e43967626c764 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2120,8 +2120,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5]
-; VI-NEXT: s_mov_b32 s6, 0
-; VI-NEXT: s_cmp_lg_u64 s[6:7], 0
; VI-NEXT: s_cbranch_scc0 .LBB16_3
; VI-NEXT: ; %bb.1:
; VI-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2272,8 +2270,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_cbranch_scc0 .LBB16_4
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
@@ -2422,10 +2418,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_clause 0x1
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1010-NEXT: s_mov_b32 s8, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX1010-NEXT: s_mov_b32 s4, 0
-; GFX1010-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1010-NEXT: ; %bb.1:
; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s6
@@ -2440,71 +2435,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1010-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1010-NEXT: v_readfirstlane_b32 s5, v1
-; GFX1010-NEXT: v_readfirstlane_b32 s8, v0
-; GFX1010-NEXT: s_mul_i32 s11, s9, s5
-; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1010-NEXT: s_mul_i32 s12, s10, s8
+; GFX1010-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1010-NEXT: v_readfirstlane_b32 s5, v0
+; GFX1010-NEXT: s_mul_i32 s11, s9, s4
+; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s5
+; GFX1010-NEXT: s_mul_i32 s12, s10, s5
; GFX1010-NEXT: s_add_i32 s11, s13, s11
-; GFX1010-NEXT: s_mul_i32 s14, s9, s8
+; GFX1010-NEXT: s_mul_i32 s14, s9, s5
; GFX1010-NEXT: s_add_i32 s11, s11, s12
-; GFX1010-NEXT: s_mul_hi_u32 s13, s8, s14
-; GFX1010-NEXT: s_mul_i32 s16, s8, s11
-; GFX1010-NEXT: s_mul_hi_u32 s15, s5, s14
-; GFX1010-NEXT: s_mul_i32 s12, s5, s14
-; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s11
+; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s14
+; GFX1010-NEXT: s_mul_i32 s16, s5, s11
+; GFX1010-NEXT: s_mul_hi_u32 s15, s4, s14
+; GFX1010-NEXT: s_mul_i32 s12, s4, s14
+; GFX1010-NEXT: s_mul_hi_u32 s14, s5, s11
; GFX1010-NEXT: s_add_u32 s13, s13, s16
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s17, s5, s11
+; GFX1010-NEXT: s_mul_hi_u32 s17, s4, s11
; GFX1010-NEXT: s_add_u32 s12, s13, s12
-; GFX1010-NEXT: s_mul_i32 s11, s5, s11
+; GFX1010-NEXT: s_mul_i32 s11, s4, s11
; GFX1010-NEXT: s_addc_u32 s12, s14, s15
; GFX1010-NEXT: s_addc_u32 s13, s17, 0
; GFX1010-NEXT: s_add_u32 s11, s12, s11
; GFX1010-NEXT: s_addc_u32 s12, 0, s13
-; GFX1010-NEXT: s_add_u32 s8, s8, s11
-; GFX1010-NEXT: s_addc_u32 s5, s5, s12
-; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1010-NEXT: s_mul_i32 s12, s9, s8
-; GFX1010-NEXT: s_mul_i32 s9, s9, s5
-; GFX1010-NEXT: s_mul_i32 s10, s10, s8
+; GFX1010-NEXT: s_add_u32 s5, s5, s11
+; GFX1010-NEXT: s_addc_u32 s4, s4, s12
+; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s5
+; GFX1010-NEXT: s_mul_i32 s12, s9, s5
+; GFX1010-NEXT: s_mul_i32 s9, s9, s4
+; GFX1010-NEXT: s_mul_i32 s10, s10, s5
; GFX1010-NEXT: s_add_i32 s9, s11, s9
-; GFX1010-NEXT: s_mul_i32 s11, s5, s12
+; GFX1010-NEXT: s_mul_i32 s11, s4, s12
; GFX1010-NEXT: s_add_i32 s9, s9, s10
-; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12
-; GFX1010-NEXT: s_mul_i32 s15, s8, s9
-; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9
+; GFX1010-NEXT: s_mul_hi_u32 s10, s5, s12
+; GFX1010-NEXT: s_mul_i32 s15, s5, s9
+; GFX1010-NEXT: s_mul_hi_u32 s14, s5, s9
; GFX1010-NEXT: s_add_u32 s10, s10, s15
-; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12
+; GFX1010-NEXT: s_mul_hi_u32 s13, s4, s12
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1010-NEXT: s_mul_hi_u32 s12, s4, s9
; GFX1010-NEXT: s_add_u32 s10, s10, s11
-; GFX1010-NEXT: s_mul_i32 s9, s5, s9
+; GFX1010-NEXT: s_mul_i32 s9, s4, s9
; GFX1010-NEXT: s_addc_u32 s10, s14, s13
; GFX1010-NEXT: s_addc_u32 s11, s12, 0
; GFX1010-NEXT: s_add_u32 s9, s10, s9
; GFX1010-NEXT: s_addc_u32 s10, 0, s11
-; GFX1010-NEXT: s_add_u32 s8, s8, s9
-; GFX1010-NEXT: s_addc_u32 s5, s5, s10
-; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1010-NEXT: s_mul_i32 s12, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8
-; GFX1010-NEXT: s_mul_i32 s8, s3, s8
+; GFX1010-NEXT: s_add_u32 s5, s5, s9
+; GFX1010-NEXT: s_addc_u32 s4, s4, s10
+; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s5
+; GFX1010-NEXT: s_mul_i32 s12, s2, s4
+; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s4
+; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s5
+; GFX1010-NEXT: s_mul_i32 s5, s3, s5
; GFX1010-NEXT: s_add_u32 s9, s9, s12
; GFX1010-NEXT: s_addc_u32 s11, 0, s11
-; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1010-NEXT: s_add_u32 s8, s9, s8
-; GFX1010-NEXT: s_mul_i32 s5, s3, s5
-; GFX1010-NEXT: s_addc_u32 s8, s11, s10
+; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s4
+; GFX1010-NEXT: s_add_u32 s5, s9, s5
+; GFX1010-NEXT: s_mul_i32 s4, s3, s4
+; GFX1010-NEXT: s_addc_u32 s5, s11, s10
; GFX1010-NEXT: s_addc_u32 s9, s13, 0
-; GFX1010-NEXT: s_add_u32 s5, s8, s5
-; GFX1010-NEXT: s_addc_u32 s8, 0, s9
-; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s5
-; GFX1010-NEXT: s_mul_i32 s10, s6, s8
-; GFX1010-NEXT: s_mul_i32 s11, s7, s5
-; GFX1010-NEXT: s_add_i32 s9, s9, s10
+; GFX1010-NEXT: s_add_u32 s4, s5, s4
+; GFX1010-NEXT: s_addc_u32 s5, 0, s9
+; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s4
; GFX1010-NEXT: s_mul_i32 s10, s6, s5
+; GFX1010-NEXT: s_mul_i32 s11, s7, s4
+; GFX1010-NEXT: s_add_i32 s9, s9, s10
+; GFX1010-NEXT: s_mul_i32 s10, s6, s4
; GFX1010-NEXT: s_add_i32 s9, s9, s11
; GFX1010-NEXT: s_sub_i32 s11, s3, s9
; GFX1010-NEXT: s_sub_u32 s10, s2, s10
@@ -2518,10 +2513,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_cselect_b32 s13, -1, 0
; GFX1010-NEXT: s_cmp_eq_u32 s11, s7
; GFX1010-NEXT: s_cselect_b32 s11, s13, s14
-; GFX1010-NEXT: s_add_u32 s13, s5, 1
-; GFX1010-NEXT: s_addc_u32 s14, s8, 0
-; GFX1010-NEXT: s_add_u32 s15, s5, 2
-; GFX1010-NEXT: s_addc_u32 s16, s8, 0
+; GFX1010-NEXT: s_add_u32 s13, s4, 1
+; GFX1010-NEXT: s_addc_u32 s14, s5, 0
+; GFX1010-NEXT: s_add_u32 s15, s4, 2
+; GFX1010-NEXT: s_addc_u32 s16, s5, 0
; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
; GFX1010-NEXT: s_cselect_b32 s11, s15, s13
; GFX1010-NEXT: s_cselect_b32 s13, s16, s14
@@ -2534,14 +2529,13 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_cmp_eq_u32 s3, s7
; GFX1010-NEXT: s_cselect_b32 s3, s10, s9
; GFX1010-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1010-NEXT: s_cselect_b32 s9, s13, s8
-; GFX1010-NEXT: s_cselect_b32 s8, s11, s5
-; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
+; GFX1010-NEXT: s_cselect_b32 s5, s13, s5
+; GFX1010-NEXT: s_cselect_b32 s4, s11, s4
+; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3
; GFX1010-NEXT: .LBB16_2:
; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX1010-NEXT: s_sub_i32 s4, 0, s6
-; GFX1010-NEXT: s_mov_b32 s9, 0
; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX1010-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -2559,15 +2553,16 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_cselect_b32 s2, s5, s2
; GFX1010-NEXT: s_add_i32 s4, s3, 1
; GFX1010-NEXT: s_cmp_ge_u32 s2, s6
-; GFX1010-NEXT: s_cselect_b32 s8, s4, s3
+; GFX1010-NEXT: s_mov_b32 s5, 0
+; GFX1010-NEXT: s_cselect_b32 s4, s4, s3
; GFX1010-NEXT: .LBB16_3:
-; GFX1010-NEXT: v_mov_b32_e32 v0, s8
+; GFX1010-NEXT: v_mov_b32_e32 v0, s4
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
-; GFX1010-NEXT: v_mov_b32_e32 v1, s9
+; GFX1010-NEXT: v_mov_b32_e32 v1, s5
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: s_endpgm
; GFX1010-NEXT: .LBB16_4:
-; GFX1010-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX1010-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1010-NEXT: s_branch .LBB16_2
;
; GFX1030W32-LABEL: sudiv64:
@@ -2575,10 +2570,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX1030W32-NEXT: s_mov_b32 s8, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5]
-; GFX1030W32-NEXT: s_mov_b32 s6, 0
-; GFX1030W32-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1030W32-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1030W32-NEXT: ; %bb.1:
; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2593,71 +2587,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1030W32-NEXT: v_readfirstlane_b32 s7, v1
-; GFX1030W32-NEXT: v_readfirstlane_b32 s8, v0
-; GFX1030W32-NEXT: s_mul_i32 s11, s9, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s12, s10, s8
+; GFX1030W32-NEXT: v_readfirstlane_b32 s6, v1
+; GFX1030W32-NEXT: v_readfirstlane_b32 s7, v0
+; GFX1030W32-NEXT: s_mul_i32 s11, s9, s6
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s7
+; GFX1030W32-NEXT: s_mul_i32 s12, s10, s7
; GFX1030W32-NEXT: s_add_i32 s11, s13, s11
-; GFX1030W32-NEXT: s_mul_i32 s14, s9, s8
+; GFX1030W32-NEXT: s_mul_i32 s14, s9, s7
; GFX1030W32-NEXT: s_add_i32 s11, s11, s12
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s8, s14
-; GFX1030W32-NEXT: s_mul_i32 s16, s8, s11
-; GFX1030W32-NEXT: s_mul_hi_u32 s15, s7, s14
-; GFX1030W32-NEXT: s_mul_i32 s12, s7, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s14
+; GFX1030W32-NEXT: s_mul_i32 s16, s7, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s15, s6, s14
+; GFX1030W32-NEXT: s_mul_i32 s12, s6, s14
+; GFX1030W32-NEXT: s_mul_hi_u32 s14, s7, s11
; GFX1030W32-NEXT: s_add_u32 s13, s13, s16
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s17, s7, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s17, s6, s11
; GFX1030W32-NEXT: s_add_u32 s12, s13, s12
-; GFX1030W32-NEXT: s_mul_i32 s11, s7, s11
+; GFX1030W32-NEXT: s_mul_i32 s11, s6, s11
; GFX1030W32-NEXT: s_addc_u32 s12, s14, s15
; GFX1030W32-NEXT: s_addc_u32 s13, s17, 0
; GFX1030W32-NEXT: s_add_u32 s11, s12, s11
; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13
-; GFX1030W32-NEXT: s_add_u32 s8, s8, s11
-; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
-; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT: s_add_u32 s7, s7, s11
+; GFX1030W32-NEXT: s_addc_u32 s6, s6, s12
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s7
+; GFX1030W32-NEXT: s_mul_i32 s12, s9, s7
+; GFX1030W32-NEXT: s_mul_i32 s9, s9, s6
+; GFX1030W32-NEXT: s_mul_i32 s10, s10, s7
; GFX1030W32-NEXT: s_add_i32 s9, s11, s9
-; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12
+; GFX1030W32-NEXT: s_mul_i32 s11, s6, s12
; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12
-; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s7, s12
+; GFX1030W32-NEXT: s_mul_i32 s15, s7, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s14, s7, s9
; GFX1030W32-NEXT: s_add_u32 s10, s10, s15
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s6, s12
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s12, s6, s9
; GFX1030W32-NEXT: s_add_u32 s10, s10, s11
-; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9
+; GFX1030W32-NEXT: s_mul_i32 s9, s6, s9
; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13
; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0
; GFX1030W32-NEXT: s_add_u32 s9, s10, s9
; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11
-; GFX1030W32-NEXT: s_add_u32 s8, s8, s9
-; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8
-; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT: s_add_u32 s7, s7, s9
+; GFX1030W32-NEXT: s_addc_u32 s6, s6, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s7
+; GFX1030W32-NEXT: s_mul_i32 s12, s2, s6
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s6
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s7
+; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
; GFX1030W32-NEXT: s_add_u32 s9, s9, s12
; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX1030W32-NEXT: s_add_u32 s8, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s6
+; GFX1030W32-NEXT: s_add_u32 s7, s9, s7
+; GFX1030W32-NEXT: s_mul_i32 s6, s3, s6
+; GFX1030W32-NEXT: s_addc_u32 s7, s11, s10
; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0
-; GFX1030W32-NEXT: s_add_u32 s7, s8, s7
-; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s4, s7
-; GFX1030W32-NEXT: s_mul_i32 s10, s4, s8
-; GFX1030W32-NEXT: s_mul_i32 s11, s5, s7
-; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
+; GFX1030W32-NEXT: s_add_u32 s6, s7, s6
+; GFX1030W32-NEXT: s_addc_u32 s7, 0, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s4, s6
; GFX1030W32-NEXT: s_mul_i32 s10, s4, s7
+; GFX1030W32-NEXT: s_mul_i32 s11, s5, s6
+; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
+; GFX1030W32-NEXT: s_mul_i32 s10, s4, s6
; GFX1030W32-NEXT: s_add_i32 s9, s9, s11
; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9
; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10
@@ -2671,10 +2665,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_cselect_b32 s13, -1, 0
; GFX1030W32-NEXT: s_cmp_eq_u32 s11, s5
; GFX1030W32-NEXT: s_cselect_b32 s11, s13, s14
-; GFX1030W32-NEXT: s_add_u32 s13, s7, 1
-; GFX1030W32-NEXT: s_addc_u32 s14, s8, 0
-; GFX1030W32-NEXT: s_add_u32 s15, s7, 2
-; GFX1030W32-NEXT: s_addc_u32 s16, s8, 0
+; GFX1030W32-NEXT: s_add_u32 s13, s6, 1
+; GFX1030W32-NEXT: s_addc_u32 s14, s7, 0
+; GFX1030W32-NEXT: s_add_u32 s15, s6, 2
+; GFX1030W32-NEXT: s_addc_u32 s16, s7, 0
; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
; GFX1030W32-NEXT: s_cselect_b32 s11, s15, s13
; GFX1030W32-NEXT: s_cselect_b32 s13, s16, s14
@@ -2687,14 +2681,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_cmp_eq_u32 s3, s5
; GFX1030W32-NEXT: s_cselect_b32 s3, s10, s9
; GFX1030W32-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1030W32-NEXT: s_cselect_b32 s9, s13, s8
-; GFX1030W32-NEXT: s_cselect_b32 s8, s11, s7
-; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s6
+; GFX1030W32-NEXT: s_cselect_b32 s7, s13, s7
+; GFX1030W32-NEXT: s_cselect_b32 s6, s11, s6
+; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX1030W32-NEXT: s_cbranch_vccnz .LBB16_3
; GFX1030W32-NEXT: .LBB16_2:
; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX1030W32-NEXT: s_sub_i32 s5, 0, s4
-; GFX1030W32-NEXT: s_mov_b32 s9, 0
+; GFX1030W32-NEXT: s_mov_b32 s7, 0
; GFX1030W32-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -2712,15 +2706,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_cselect_b32 s2, s6, s2
; GFX1030W32-NEXT: s_add_i32 s5, s3, 1
; GFX1030W32-NEXT: s_cmp_ge_u32 s2, s4
-; GFX1030W32-NEXT: s_cselect_b32 s8, s5, s3
+; GFX1030W32-NEXT: s_cselect_b32 s6, s5, s3
; GFX1030W32-NEXT: .LBB16_3:
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s9
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: s_endpgm
; GFX1030W32-NEXT: .LBB16_4:
-; GFX1030W32-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX1030W32-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1030W32-NEXT: s_branch .LBB16_2
;
; GFX1030W64-LABEL: sudiv64:
@@ -2730,8 +2724,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5]
-; GFX1030W64-NEXT: s_mov_b32 s6, 0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1030W64-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1030W64-NEXT: ; %bb.1:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2880,11 +2872,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5]
-; GFX11-NEXT: s_mov_b32 s6, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX11-NEXT: s_cbranch_scc0 .LBB16_4
; GFX11-NEXT: ; %bb.1:
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2904,71 +2894,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s7, v1
-; GFX11-NEXT: v_readfirstlane_b32 s8, v0
-; GFX11-NEXT: s_mul_i32 s11, s9, s7
-; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX11-NEXT: s_mul_i32 s12, s10, s8
+; GFX11-NEXT: v_readfirstlane_b32 s6, v1
+; GFX11-NEXT: v_readfirstlane_b32 s7, v0
+; GFX11-NEXT: s_mul_i32 s11, s9, s6
+; GFX11-NEXT: s_mul_hi_u32 s13, s9, s7
+; GFX11-NEXT: s_mul_i32 s12, s10, s7
; GFX11-NEXT: s_add_i32 s11, s13, s11
-; GFX11-NEXT: s_mul_i32 s14, s9, s8
+; GFX11-NEXT: s_mul_i32 s14, s9, s7
; GFX11-NEXT: s_add_i32 s11, s11, s12
-; GFX11-NEXT: s_mul_hi_u32 s13, s8, s14
-; GFX11-NEXT: s_mul_i32 s16, s8, s11
-; GFX11-NEXT: s_mul_hi_u32 s15, s7, s14
-; GFX11-NEXT: s_mul_i32 s12, s7, s14
-; GFX11-NEXT: s_mul_hi_u32 s14, s8, s11
+; GFX11-NEXT: s_mul_hi_u32 s13, s7, s14
+; GFX11-NEXT: s_mul_i32 s16, s7, s11
+; GFX11-NEXT: s_mul_hi_u32 s15, s6, s14
+; GFX11-NEXT: s_mul_i32 s12, s6, s14
+; GFX11-NEXT: s_mul_hi_u32 s14, s7, s11
; GFX11-NEXT: s_add_u32 s13, s13, s16
; GFX11-NEXT: s_addc_u32 s14, 0, s14
-; GFX11-NEXT: s_mul_hi_u32 s17, s7, s11
+; GFX11-NEXT: s_mul_hi_u32 s17, s6, s11
; GFX11-NEXT: s_add_u32 s12, s13, s12
-; GFX11-NEXT: s_mul_i32 s11, s7, s11
+; GFX11-NEXT: s_mul_i32 s11, s6, s11
; GFX11-NEXT: s_addc_u32 s12, s14, s15
; GFX11-NEXT: s_addc_u32 s13, s17, 0
; GFX11-NEXT: s_add_u32 s11, s12, s11
; GFX11-NEXT: s_addc_u32 s12, 0, s13
-; GFX11-NEXT: s_add_u32 s8, s8, s11
-; GFX11-NEXT: s_addc_u32 s7, s7, s12
-; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX11-NEXT: s_mul_i32 s12, s9, s8
-; GFX11-NEXT: s_mul_i32 s9, s9, s7
-; GFX11-NEXT: s_mul_i32 s10, s10, s8
+; GFX11-NEXT: s_add_u32 s7, s7, s11
+; GFX11-NEXT: s_addc_u32 s6, s6, s12
+; GFX11-NEXT: s_mul_hi_u32 s11, s9, s7
+; GFX11-NEXT: s_mul_i32 s12, s9, s7
+; GFX11-NEXT: s_mul_i32 s9, s9, s6
+; GFX11-NEXT: s_mul_i32 s10, s10, s7
; GFX11-NEXT: s_add_i32 s9, s11, s9
-; GFX11-NEXT: s_mul_i32 s11, s7, s12
+; GFX11-NEXT: s_mul_i32 s11, s6, s12
; GFX11-NEXT: s_add_i32 s9, s9, s10
-; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12
-; GFX11-NEXT: s_mul_i32 s15, s8, s9
-; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9
+; GFX11-NEXT: s_mul_hi_u32 s10, s7, s12
+; GFX11-NEXT: s_mul_i32 s15, s7, s9
+; GFX11-NEXT: s_mul_hi_u32 s14, s7, s9
; GFX11-NEXT: s_add_u32 s10, s10, s15
-; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12
+; GFX11-NEXT: s_mul_hi_u32 s13, s6, s12
; GFX11-NEXT: s_addc_u32 s14, 0, s14
-; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX11-NEXT: s_mul_hi_u32 s12, s6, s9
; GFX11-NEXT: s_add_u32 s10, s10, s11
-; GFX11-NEXT: s_mul_i32 s9, s7, s9
+; GFX11-NEXT: s_mul_i32 s9, s6, s9
; GFX11-NEXT: s_addc_u32 s10, s14, s13
; GFX11-NEXT: s_addc_u32 s11, s12, 0
; GFX11-NEXT: s_add_u32 s9, s10, s9
; GFX11-NEXT: s_addc_u32 s10, 0, s11
-; GFX11-NEXT: s_add_u32 s8, s8, s9
-; GFX11-NEXT: s_addc_u32 s7, s7, s10
-; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX11-NEXT: s_mul_i32 s12, s2, s7
-; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7
-; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8
-; GFX11-NEXT: s_mul_i32 s8, s3, s8
+; GFX11-NEXT: s_add_u32 s7, s7, s9
+; GFX11-NEXT: s_addc_u32 s6, s6, s10
+; GFX11-NEXT: s_mul_hi_u32 s9, s2, s7
+; GFX11-NEXT: s_mul_i32 s12, s2, s6
+; GFX11-NEXT: s_mul_hi_u32 s11, s2, s6
+; GFX11-NEXT: s_mul_hi_u32 s10, s3, s7
+; GFX11-NEXT: s_mul_i32 s7, s3, s7
; GFX11-NEXT: s_add_u32 s9, s9, s12
; GFX11-NEXT: s_addc_u32 s11, 0, s11
-; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX11-NEXT: s_add_u32 s8, s9, s8
-; GFX11-NEXT: s_mul_i32 s7, s3, s7
-; GFX11-NEXT: s_addc_u32 s8, s11, s10
+; GFX11-NEXT: s_mul_hi_u32 s13, s3, s6
+; GFX11-NEXT: s_add_u32 s7, s9, s7
+; GFX11-NEXT: s_mul_i32 s6, s3, s6
+; GFX11-NEXT: s_addc_u32 s7, s11, s10
; GFX11-NEXT: s_addc_u32 s9, s13, 0
-; GFX11-NEXT: s_add_u32 s7, s8, s7
-; GFX11-NEXT: s_addc_u32 s8, 0, s9
-; GFX11-NEXT: s_mul_hi_u32 s9, s4, s7
-; GFX11-NEXT: s_mul_i32 s10, s4, s8
-; GFX11-NEXT: s_mul_i32 s11, s5, s7
-; GFX11-NEXT: s_add_i32 s9, s9, s10
+; GFX11-NEXT: s_add_u32 s6, s7, s6
+; GFX11-NEXT: s_addc_u32 s7, 0, s9
+; GFX11-NEXT: s_mul_hi_u32 s9, s4, s6
; GFX11-NEXT: s_mul_i32 s10, s4, s7
+; GFX11-NEXT: s_mul_i32 s11, s5, s6
+; GFX11-NEXT: s_add_i32 s9, s9, s10
+; GFX11-NEXT: s_mul_i32 s10, s4, s6
; GFX11-NEXT: s_add_i32 s9, s9, s11
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s11, s3, s9
@@ -2984,10 +2974,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s11, s5
; GFX11-NEXT: s_cselect_b32 s11, s13, s14
-; GFX11-NEXT: s_add_u32 s13, s7, 1
-; GFX11-NEXT: s_addc_u32 s14, s8, 0
-; GFX11-NEXT: s_add_u32 s15, s7, 2
-; GFX11-NEXT: s_addc_u32 s16, s8, 0
+; GFX11-NEXT: s_add_u32 s13, s6, 1
+; GFX11-NEXT: s_addc_u32 s14, s7, 0
+; GFX11-NEXT: s_add_u32 s15, s6, 2
+; GFX11-NEXT: s_addc_u32 s16, s7, 0
; GFX11-NEXT: s_cmp_lg_u32 s11, 0
; GFX11-NEXT: s_cselect_b32 s11, s15, s13
; GFX11-NEXT: s_cselect_b32 s13, s16, s14
@@ -3002,14 +2992,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_cselect_b32 s3, s10, s9
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_cselect_b32 s9, s13, s8
-; GFX11-NEXT: s_cselect_b32 s8, s11, s7
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
+; GFX11-NEXT: s_cselect_b32 s7, s13, s7
+; GFX11-NEXT: s_cselect_b32 s6, s11, s6
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
; GFX11-NEXT: s_cbranch_vccnz .LBB16_3
; GFX11-NEXT: .LBB16_2:
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX11-NEXT: s_sub_i32 s5, 0, s4
-; GFX11-NEXT: s_mov_b32 s9, 0
+; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -3032,15 +3022,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_cselect_b32 s2, s6, s2
; GFX11-NEXT: s_add_i32 s5, s3, 1
; GFX11-NEXT: s_cmp_ge_u32 s2, s4
-; GFX11-NEXT: s_cselect_b32 s8, s5, s3
+; GFX11-NEXT: s_cselect_b32 s6, s5, s3
; GFX11-NEXT: .LBB16_3:
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s8
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB16_4:
-; GFX11-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX11-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX11-NEXT: s_branch .LBB16_2
;
; GFX1250-LABEL: sudiv64:
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index 0166d7ac7ddc2..1f965c16ef4f2 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -403,8 +403,6 @@ define amdgpu_ps i32 @bfe_i64(i64 inreg %val0) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x80000
; CHECK-NEXT: s_and_b32 s0, s0, 0xff
-; CHECK-NEXT: s_mov_b32 s1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -440,7 +438,6 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 0xff
; CHECK-NEXT: s_mov_b32 s1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -522,7 +519,6 @@ define amdgpu_ps i32 @bcnt164(i64 inreg %val0) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT: s_mov_b32 s1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index e12e31b14e97d..3c3d634c96410 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1501,8 +1501,6 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_readfirstlane_b32 s3, v3
; GCN-NEXT: v_readfirstlane_b32 s2, v2
; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3]
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0
; GCN-NEXT: s_cbranch_scc0 .LBB8_4
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_ashr_i32 s6, s3, 31
@@ -1832,8 +1830,6 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: v_readfirstlane_b32 s3, v3
; TONGA-NEXT: v_readfirstlane_b32 s2, v2
; TONGA-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3]
-; TONGA-NEXT: s_mov_b32 s6, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0
; TONGA-NEXT: s_cbranch_scc0 .LBB8_3
; TONGA-NEXT: ; %bb.1:
; TONGA-NEXT: s_ashr_i32 s6, s3, 31
@@ -2701,12 +2697,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s11, v5
; GCN-NEXT: v_readfirstlane_b32 s10, v4
-; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[8:9]
-; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_readfirstlane_b32 s3, v3
; GCN-NEXT: v_readfirstlane_b32 s2, v2
; GCN-NEXT: v_readfirstlane_b32 s5, v7
-; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[8:9]
; GCN-NEXT: v_readfirstlane_b32 s4, v6
; GCN-NEXT: s_cbranch_scc0 .LBB10_6
; GCN-NEXT: ; %bb.1:
@@ -2855,8 +2849,6 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s6, s9, s6
; GCN-NEXT: .LBB10_3:
; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3]
-; GCN-NEXT: s_mov_b32 s8, 0
-; GCN-NEXT: s_cmp_lg_u64 s[8:9], 0
; GCN-NEXT: s_cbranch_scc0 .LBB10_7
; GCN-NEXT: ; %bb.4:
; GCN-NEXT: s_ashr_i32 s8, s3, 31
@@ -3344,8 +3336,6 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s3, v5
; TONGA-NEXT: v_readfirstlane_b32 s2, v4
; TONGA-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1]
-; TONGA-NEXT: s_mov_b32 s6, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0
; TONGA-NEXT: s_cbranch_scc0 .LBB10_3
; TONGA-NEXT: ; %bb.1:
; TONGA-NEXT: s_ashr_i32 s6, s1, 31
@@ -4878,8 +4868,6 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s19, v13
; GCN-NEXT: v_readfirstlane_b32 s18, v12
-; GCN-NEXT: s_or_b64 s[6:7], s[18:19], s[16:17]
-; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_readfirstlane_b32 s2, v2
; GCN-NEXT: v_readfirstlane_b32 s9, v1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
@@ -4890,9 +4878,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_readfirstlane_b32 s11, v9
; GCN-NEXT: v_readfirstlane_b32 s10, v8
; GCN-NEXT: v_readfirstlane_b32 s15, v15
-; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GCN-NEXT: s_or_b64 s[6:7], s[18:19], s[16:17]
; GCN-NEXT: v_readfirstlane_b32 s14, v14
-; GCN-NEXT: s_cbranch_scc0 .LBB12_6
+; GCN-NEXT: s_cbranch_scc0 .LBB12_10
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_ashr_i32 s6, s17, 31
; GCN-NEXT: s_add_u32 s20, s16, s6
@@ -5039,9 +5027,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s6, s17, s6
; GCN-NEXT: .LBB12_3:
; GCN-NEXT: s_or_b64 s[16:17], s[14:15], s[12:13]
-; GCN-NEXT: s_mov_b32 s16, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT: s_cbranch_scc0 .LBB12_7
+; GCN-NEXT: s_cbranch_scc0 .LBB12_11
; GCN-NEXT: ; %bb.4:
; GCN-NEXT: s_ashr_i32 s16, s13, 31
; GCN-NEXT: s_add_u32 s18, s12, s16
@@ -5165,7 +5151,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_xor_b64 s[18:19], s[18:19], s[20:21]
; GCN-NEXT: s_sub_u32 s18, s18, s20
; GCN-NEXT: s_subb_u32 s19, s19, s20
-; GCN-NEXT: s_cbranch_execnz .LBB12_8
+; GCN-NEXT: s_cbranch_execnz .LBB12_12
; GCN-NEXT: .LBB12_5:
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12
; GCN-NEXT: s_sub_i32 s13, 0, s12
@@ -5185,22 +5171,35 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_subrev_u32_e32 v1, s12, v0
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GCN-NEXT: s_branch .LBB12_9
+; GCN-NEXT: s_or_b64 s[12:13], s[10:11], s[8:9]
+; GCN-NEXT: s_cbranch_scc1 .LBB12_13
; GCN-NEXT: .LBB12_6:
+; GCN-NEXT: ; implicit-def: $sgpr14_sgpr15
+; GCN-NEXT: s_branch .LBB12_14
+; GCN-NEXT: .LBB12_7:
+; GCN-NEXT: v_mov_b32_e32 v4, s14
+; GCN-NEXT: v_mov_b32_e32 v5, s15
+; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3]
+; GCN-NEXT: s_cbranch_scc1 .LBB12_15
+; GCN-NEXT: .LBB12_8:
+; GCN-NEXT: ; implicit-def: $sgpr10_sgpr11
+; GCN-NEXT: s_branch .LBB12_16
+; GCN-NEXT: .LBB12_9:
+; GCN-NEXT: v_mov_b32_e32 v6, s10
+; GCN-NEXT: v_mov_b32_e32 v7, s11
+; GCN-NEXT: s_branch .LBB12_17
+; GCN-NEXT: .LBB12_10:
; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
; GCN-NEXT: s_branch .LBB12_2
-; GCN-NEXT: .LBB12_7:
+; GCN-NEXT: .LBB12_11:
; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19
; GCN-NEXT: s_branch .LBB12_5
-; GCN-NEXT: .LBB12_8:
+; GCN-NEXT: .LBB12_12:
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: .LBB12_9:
; GCN-NEXT: s_or_b64 s[12:13], s[10:11], s[8:9]
-; GCN-NEXT: s_mov_b32 s12, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_cbranch_scc0 .LBB12_12
-; GCN-NEXT: ; %bb.10:
+; GCN-NEXT: s_cbranch_scc0 .LBB12_6
+; GCN-NEXT: .LBB12_13:
; GCN-NEXT: s_ashr_i32 s12, s9, 31
; GCN-NEXT: s_add_u32 s14, s8, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -5323,8 +5322,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_xor_b64 s[14:15], s[14:15], s[16:17]
; GCN-NEXT: s_sub_u32 s14, s14, s16
; GCN-NEXT: s_subb_u32 s15, s15, s16
-; GCN-NEXT: s_cbranch_execnz .LBB12_13
-; GCN-NEXT: .LBB12_11:
+; GCN-NEXT: s_cbranch_execnz .LBB12_7
+; GCN-NEXT: .LBB12_14:
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: s_sub_i32 s9, 0, s8
; GCN-NEXT: v_mov_b32_e32 v5, 0
@@ -5343,19 +5342,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_subrev_u32_e32 v1, s8, v0
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
-; GCN-NEXT: s_branch .LBB12_14
-; GCN-NEXT: .LBB12_12:
-; GCN-NEXT: ; implicit-def: $sgpr14_sgpr15
-; GCN-NEXT: s_branch .LBB12_11
-; GCN-NEXT: .LBB12_13:
-; GCN-NEXT: v_mov_b32_e32 v4, s14
-; GCN-NEXT: v_mov_b32_e32 v5, s15
-; GCN-NEXT: .LBB12_14:
; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3]
-; GCN-NEXT: s_mov_b32 s8, 0
-; GCN-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GCN-NEXT: s_cbranch_scc0 .LBB12_17
-; GCN-NEXT: ; %bb.15:
+; GCN-NEXT: s_cbranch_scc0 .LBB12_8
+; GCN-NEXT: .LBB12_15:
; GCN-NEXT: s_ashr_i32 s8, s3, 31
; GCN-NEXT: s_add_u32 s10, s2, s8
; GCN-NEXT: s_mov_b32 s9, s8
@@ -5478,7 +5467,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13]
; GCN-NEXT: s_sub_u32 s10, s10, s12
; GCN-NEXT: s_subb_u32 s11, s11, s12
-; GCN-NEXT: s_cbranch_execnz .LBB12_18
+; GCN-NEXT: s_cbranch_execnz .LBB12_9
; GCN-NEXT: .LBB12_16:
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT: s_sub_i32 s3, 0, s2
@@ -5498,14 +5487,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_subrev_u32_e32 v1, s2, v0
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
; GCN-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
-; GCN-NEXT: s_branch .LBB12_19
; GCN-NEXT: .LBB12_17:
-; GCN-NEXT: ; implicit-def: $sgpr10_sgpr11
-; GCN-NEXT: s_branch .LBB12_16
-; GCN-NEXT: .LBB12_18:
-; GCN-NEXT: v_mov_b32_e32 v6, s10
-; GCN-NEXT: v_mov_b32_e32 v7, s11
-; GCN-NEXT: .LBB12_19:
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
@@ -6119,23 +6101,23 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_add_u32 s0, s6, 48
; TONGA-NEXT: v_mov_b32_e32 v0, s6
-; TONGA-NEXT: s_addc_u32 s1, s7, 0
+; TONGA-NEXT: s_add_u32 s0, s6, 48
; TONGA-NEXT: v_mov_b32_e32 v1, s7
-; TONGA-NEXT: s_add_u32 s2, s6, 32
+; TONGA-NEXT: s_addc_u32 s1, s7, 0
; TONGA-NEXT: flat_load_dwordx4 v[14:17], v[0:1]
-; TONGA-NEXT: s_addc_u32 s3, s7, 0
-; TONGA-NEXT: v_mov_b32_e32 v0, s2
-; TONGA-NEXT: v_mov_b32_e32 v1, s3
-; TONGA-NEXT: flat_load_dwordx4 v[10:13], v[0:1]
; TONGA-NEXT: v_mov_b32_e32 v0, s0
; TONGA-NEXT: v_mov_b32_e32 v1, s1
+; TONGA-NEXT: s_add_u32 s0, s6, 32
+; TONGA-NEXT: s_addc_u32 s1, s7, 0
+; TONGA-NEXT: v_mov_b32_e32 v3, s1
+; TONGA-NEXT: v_mov_b32_e32 v2, s0
; TONGA-NEXT: s_add_u32 s0, s6, 16
; TONGA-NEXT: s_addc_u32 s1, s7, 0
; TONGA-NEXT: v_mov_b32_e32 v5, s1
-; TONGA-NEXT: v_mov_b32_e32 v4, s0
+; TONGA-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; TONGA-NEXT: v_mov_b32_e32 v4, s0
; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; TONGA-NEXT: s_waitcnt vmcnt(3)
; TONGA-NEXT: v_readfirstlane_b32 s3, v15
@@ -6144,8 +6126,6 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s1, v11
; TONGA-NEXT: v_readfirstlane_b32 s0, v10
; TONGA-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1]
-; TONGA-NEXT: s_mov_b32 s6, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0
; TONGA-NEXT: s_cbranch_scc0 .LBB12_3
; TONGA-NEXT: ; %bb.1:
; TONGA-NEXT: s_ashr_i32 s6, s1, 31
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 28c6b40554bb6..51aa8706abac2 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -731,12 +731,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_udiv64:
; GFX1032: ; %bb.0: ; %bb
; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX1032-NEXT: s_mov_b32 s8, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_or_b64 s[4:5], s[2:3], s[0:1]
-; GFX1032-NEXT: s_mov_b32 s4, 0
-; GFX1032-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1032-NEXT: s_cbranch_scc0 .LBB15_4
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s0
@@ -751,71 +750,71 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
-; GFX1032-NEXT: v_readfirstlane_b32 s8, v0
-; GFX1032-NEXT: s_mul_i32 s11, s9, s5
-; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1032-NEXT: s_mul_i32 s12, s10, s8
+; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s5, v0
+; GFX1032-NEXT: s_mul_i32 s11, s9, s4
+; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s5
+; GFX1032-NEXT: s_mul_i32 s12, s10, s5
; GFX1032-NEXT: s_add_i32 s11, s13, s11
-; GFX1032-NEXT: s_mul_i32 s14, s9, s8
+; GFX1032-NEXT: s_mul_i32 s14, s9, s5
; GFX1032-NEXT: s_add_i32 s11, s11, s12
-; GFX1032-NEXT: s_mul_hi_u32 s13, s8, s14
-; GFX1032-NEXT: s_mul_i32 s16, s8, s11
-; GFX1032-NEXT: s_mul_hi_u32 s15, s5, s14
-; GFX1032-NEXT: s_mul_i32 s12, s5, s14
-; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s11
+; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s14
+; GFX1032-NEXT: s_mul_i32 s16, s5, s11
+; GFX1032-NEXT: s_mul_hi_u32 s15, s4, s14
+; GFX1032-NEXT: s_mul_i32 s12, s4, s14
+; GFX1032-NEXT: s_mul_hi_u32 s14, s5, s11
; GFX1032-NEXT: s_add_u32 s13, s13, s16
; GFX1032-NEXT: s_addc_u32 s14, 0, s14
-; GFX1032-NEXT: s_mul_hi_u32 s17, s5, s11
+; GFX1032-NEXT: s_mul_hi_u32 s17, s4, s11
; GFX1032-NEXT: s_add_u32 s12, s13, s12
-; GFX1032-NEXT: s_mul_i32 s11, s5, s11
+; GFX1032-NEXT: s_mul_i32 s11, s4, s11
; GFX1032-NEXT: s_addc_u32 s12, s14, s15
; GFX1032-NEXT: s_addc_u32 s13, s17, 0
; GFX1032-NEXT: s_add_u32 s11, s12, s11
; GFX1032-NEXT: s_addc_u32 s12, 0, s13
-; GFX1032-NEXT: s_add_u32 s8, s8, s11
-; GFX1032-NEXT: s_addc_u32 s5, s5, s12
-; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1032-NEXT: s_mul_i32 s12, s9, s8
-; GFX1032-NEXT: s_mul_i32 s9, s9, s5
-; GFX1032-NEXT: s_mul_i32 s10, s10, s8
+; GFX1032-NEXT: s_add_u32 s5, s5, s11
+; GFX1032-NEXT: s_addc_u32 s4, s4, s12
+; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s5
+; GFX1032-NEXT: s_mul_i32 s12, s9, s5
+; GFX1032-NEXT: s_mul_i32 s9, s9, s4
+; GFX1032-NEXT: s_mul_i32 s10, s10, s5
; GFX1032-NEXT: s_add_i32 s9, s11, s9
-; GFX1032-NEXT: s_mul_i32 s11, s5, s12
+; GFX1032-NEXT: s_mul_i32 s11, s4, s12
; GFX1032-NEXT: s_add_i32 s9, s9, s10
-; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12
-; GFX1032-NEXT: s_mul_i32 s15, s8, s9
-; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9
+; GFX1032-NEXT: s_mul_hi_u32 s10, s5, s12
+; GFX1032-NEXT: s_mul_i32 s15, s5, s9
+; GFX1032-NEXT: s_mul_hi_u32 s14, s5, s9
; GFX1032-NEXT: s_add_u32 s10, s10, s15
-; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12
+; GFX1032-NEXT: s_mul_hi_u32 s13, s4, s12
; GFX1032-NEXT: s_addc_u32 s14, 0, s14
-; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1032-NEXT: s_mul_hi_u32 s12, s4, s9
; GFX1032-NEXT: s_add_u32 s10, s10, s11
-; GFX1032-NEXT: s_mul_i32 s9, s5, s9
+; GFX1032-NEXT: s_mul_i32 s9, s4, s9
; GFX1032-NEXT: s_addc_u32 s10, s14, s13
; GFX1032-NEXT: s_addc_u32 s11, s12, 0
; GFX1032-NEXT: s_add_u32 s9, s10, s9
; GFX1032-NEXT: s_addc_u32 s10, 0, s11
-; GFX1032-NEXT: s_add_u32 s8, s8, s9
-; GFX1032-NEXT: s_addc_u32 s5, s5, s10
-; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1032-NEXT: s_mul_i32 s12, s2, s5
-; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5
-; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8
-; GFX1032-NEXT: s_mul_i32 s8, s3, s8
+; GFX1032-NEXT: s_add_u32 s5, s5, s9
+; GFX1032-NEXT: s_addc_u32 s4, s4, s10
+; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s5
+; GFX1032-NEXT: s_mul_i32 s12, s2, s4
+; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s4
+; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s5
+; GFX1032-NEXT: s_mul_i32 s5, s3, s5
; GFX1032-NEXT: s_add_u32 s9, s9, s12
; GFX1032-NEXT: s_addc_u32 s11, 0, s11
-; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1032-NEXT: s_add_u32 s8, s9, s8
-; GFX1032-NEXT: s_mul_i32 s5, s3, s5
-; GFX1032-NEXT: s_addc_u32 s8, s11, s10
+; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s4
+; GFX1032-NEXT: s_add_u32 s5, s9, s5
+; GFX1032-NEXT: s_mul_i32 s4, s3, s4
+; GFX1032-NEXT: s_addc_u32 s5, s11, s10
; GFX1032-NEXT: s_addc_u32 s9, s13, 0
-; GFX1032-NEXT: s_add_u32 s5, s8, s5
-; GFX1032-NEXT: s_addc_u32 s8, 0, s9
-; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s5
-; GFX1032-NEXT: s_mul_i32 s10, s0, s8
-; GFX1032-NEXT: s_mul_i32 s11, s1, s5
-; GFX1032-NEXT: s_add_i32 s9, s9, s10
+; GFX1032-NEXT: s_add_u32 s4, s5, s4
+; GFX1032-NEXT: s_addc_u32 s5, 0, s9
+; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s4
; GFX1032-NEXT: s_mul_i32 s10, s0, s5
+; GFX1032-NEXT: s_mul_i32 s11, s1, s4
+; GFX1032-NEXT: s_add_i32 s9, s9, s10
+; GFX1032-NEXT: s_mul_i32 s10, s0, s4
; GFX1032-NEXT: s_add_i32 s9, s9, s11
; GFX1032-NEXT: s_sub_i32 s11, s3, s9
; GFX1032-NEXT: s_sub_u32 s10, s2, s10
@@ -829,10 +828,10 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_cselect_b32 s13, -1, 0
; GFX1032-NEXT: s_cmp_eq_u32 s11, s1
; GFX1032-NEXT: s_cselect_b32 s11, s13, s14
-; GFX1032-NEXT: s_add_u32 s13, s5, 1
-; GFX1032-NEXT: s_addc_u32 s14, s8, 0
-; GFX1032-NEXT: s_add_u32 s15, s5, 2
-; GFX1032-NEXT: s_addc_u32 s16, s8, 0
+; GFX1032-NEXT: s_add_u32 s13, s4, 1
+; GFX1032-NEXT: s_addc_u32 s14, s5, 0
+; GFX1032-NEXT: s_add_u32 s15, s4, 2
+; GFX1032-NEXT: s_addc_u32 s16, s5, 0
; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
; GFX1032-NEXT: s_cselect_b32 s11, s15, s13
; GFX1032-NEXT: s_cselect_b32 s13, s16, s14
@@ -845,14 +844,14 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_cmp_eq_u32 s3, s1
; GFX1032-NEXT: s_cselect_b32 s1, s10, s9
; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
-; GFX1032-NEXT: s_cselect_b32 s9, s13, s8
-; GFX1032-NEXT: s_cselect_b32 s8, s11, s5
-; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
+; GFX1032-NEXT: s_cselect_b32 s5, s13, s5
+; GFX1032-NEXT: s_cselect_b32 s4, s11, s4
+; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX1032-NEXT: s_cbranch_vccnz .LBB15_3
; GFX1032-NEXT: .LBB15_2:
; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s0
; GFX1032-NEXT: s_sub_i32 s3, 0, s0
-; GFX1032-NEXT: s_mov_b32 s9, 0
+; GFX1032-NEXT: s_mov_b32 s5, 0
; GFX1032-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX1032-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -870,15 +869,15 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_cselect_b32 s2, s4, s2
; GFX1032-NEXT: s_add_i32 s3, s1, 1
; GFX1032-NEXT: s_cmp_ge_u32 s2, s0
-; GFX1032-NEXT: s_cselect_b32 s8, s3, s1
+; GFX1032-NEXT: s_cselect_b32 s4, s3, s1
; GFX1032-NEXT: .LBB15_3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s8
+; GFX1032-NEXT: v_mov_b32_e32 v0, s4
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: v_mov_b32_e32 v1, s9
+; GFX1032-NEXT: v_mov_b32_e32 v1, s5
; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:16
; GFX1032-NEXT: s_endpgm
; GFX1032-NEXT: .LBB15_4:
-; GFX1032-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX1032-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1032-NEXT: s_branch .LBB15_2
;
; GFX1064-LABEL: test_udiv64:
@@ -888,8 +887,6 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_or_b64 s[4:5], s[2:3], s[0:1]
-; GFX1064-NEXT: s_mov_b32 s4, 0
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1064-NEXT: s_cbranch_scc0 .LBB15_4
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0