[llvm] [AMDGPU] Analyze REG_SEQUENCE To Remove Redundant CMP Instructions (PR #167364)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 10 10:33:03 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Patrick Simmons (linuxrocks123)
<details>
<summary>Changes</summary>
This PR adds analysis of REG_SEQUENCE instructions to the AMDGPU TII so as to allow the peephole optimizer to remove redundant CMP instructions even if the REG_SEQUENCE pseudo is in between the def and use of SCC.
---
Patch is 48.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167364.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+27)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+2)
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+168-178)
- (added) llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll (+22)
- (modified) llvm/test/CodeGen/AMDGPU/s_cmp_0.ll (-4)
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+39-59)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+56-59)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..951df6899f5a1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1312,6 +1312,30 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
return Reg;
}
+/// Look through a two-input REG_SEQUENCE whose inputs are placed at sub0 and
+/// sub1. If getConstValDefinedInReg reports that one input is the constant 0,
+/// return the unique defining instruction of the other input.
+///
+/// \returns nullptr if \p MI is not a REG_SEQUENCE of that shape, if an input
+/// has no unique def, or if neither input is a known zero.
+MachineInstr *
+SIInstrInfo::pierceThroughRegSequence(const MachineInstr &MI) const {
+  if (MI.getOpcode() != AMDGPU::REG_SEQUENCE)
+    return nullptr;
+
+  // Operand layout is (dst, src0, idx0, src1, idx1). Wider sequences carry
+  // inputs this analysis would otherwise ignore, so reject them.
+  if (MI.getNumOperands() != 5)
+    return nullptr;
+
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  int64_t SubRegValues[2] = {0, 0};
+  bool SubRegIsConst[2] = {false, false};
+  MachineInstr *RealDefs[2] = {nullptr, nullptr};
+  for (unsigned I : {2, 4}) {
+    const int64_t SubIdx = MI.getOperand(I).getImm();
+    // Anything other than sub0/sub1 does not map onto the two slots below.
+    if (SubIdx != AMDGPU::sub0 && SubIdx != AMDGPU::sub1)
+      return nullptr;
+    const unsigned ArrayIdx = SubIdx == AMDGPU::sub0 ? 0 : 1;
+
+    // NOTE(review): if this source operand carries its own sub-register index,
+    // the def found below produces a wider value than the lane placed into the
+    // sequence -- confirm that callers account for this.
+    const Register Subreg = MI.getOperand(I - 1).getReg();
+    MachineInstr *SubregDef = MRI.getUniqueVRegDef(Subreg);
+    // No unique def: nothing to analyze, and nothing safe to dereference.
+    if (!SubregDef)
+      return nullptr;
+
+    RealDefs[ArrayIdx] = SubregDef;
+    SubRegIsConst[ArrayIdx] =
+        getConstValDefinedInReg(*SubregDef, Subreg, SubRegValues[ArrayIdx]);
+  }
+
+  // If one half is a known zero, hand back the def of the other half. A slot
+  // left unfilled by a repeated index stays nullptr, i.e. "no result".
+  for (unsigned I : {0, 1})
+    if (SubRegIsConst[I] && !SubRegValues[I])
+      return RealDefs[(I + 1) % 2];
+
+  return nullptr;
+}
+
bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
const Register Reg,
int64_t &ImmVal) const {
@@ -10676,6 +10700,9 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!Def || Def->getParent() != CmpInstr.getParent())
return false;
+ if (MachineInstr *RegSequenceDef = pierceThroughRegSequence(*Def))
+ Def = RegSequenceDef;
+
// For S_OP that set SCC = DST!=0, do the transformation
//
// s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0643b532ea04c..d7d049f722b47 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -714,6 +714,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
}
}
+ MachineInstr *pierceThroughRegSequence(const MachineInstr &MI) const;
+
static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case AMDGPU::S_ABSDIFF_I32:
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b96de173dc8c6..e43967626c764 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2120,8 +2120,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5]
-; VI-NEXT: s_mov_b32 s6, 0
-; VI-NEXT: s_cmp_lg_u64 s[6:7], 0
; VI-NEXT: s_cbranch_scc0 .LBB16_3
; VI-NEXT: ; %bb.1:
; VI-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2272,8 +2270,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_cbranch_scc0 .LBB16_4
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
@@ -2422,10 +2418,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_clause 0x1
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1010-NEXT: s_mov_b32 s8, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX1010-NEXT: s_mov_b32 s4, 0
-; GFX1010-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1010-NEXT: ; %bb.1:
; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s6
@@ -2440,71 +2435,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1010-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1010-NEXT: v_readfirstlane_b32 s5, v1
-; GFX1010-NEXT: v_readfirstlane_b32 s8, v0
-; GFX1010-NEXT: s_mul_i32 s11, s9, s5
-; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1010-NEXT: s_mul_i32 s12, s10, s8
+; GFX1010-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1010-NEXT: v_readfirstlane_b32 s5, v0
+; GFX1010-NEXT: s_mul_i32 s11, s9, s4
+; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s5
+; GFX1010-NEXT: s_mul_i32 s12, s10, s5
; GFX1010-NEXT: s_add_i32 s11, s13, s11
-; GFX1010-NEXT: s_mul_i32 s14, s9, s8
+; GFX1010-NEXT: s_mul_i32 s14, s9, s5
; GFX1010-NEXT: s_add_i32 s11, s11, s12
-; GFX1010-NEXT: s_mul_hi_u32 s13, s8, s14
-; GFX1010-NEXT: s_mul_i32 s16, s8, s11
-; GFX1010-NEXT: s_mul_hi_u32 s15, s5, s14
-; GFX1010-NEXT: s_mul_i32 s12, s5, s14
-; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s11
+; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s14
+; GFX1010-NEXT: s_mul_i32 s16, s5, s11
+; GFX1010-NEXT: s_mul_hi_u32 s15, s4, s14
+; GFX1010-NEXT: s_mul_i32 s12, s4, s14
+; GFX1010-NEXT: s_mul_hi_u32 s14, s5, s11
; GFX1010-NEXT: s_add_u32 s13, s13, s16
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s17, s5, s11
+; GFX1010-NEXT: s_mul_hi_u32 s17, s4, s11
; GFX1010-NEXT: s_add_u32 s12, s13, s12
-; GFX1010-NEXT: s_mul_i32 s11, s5, s11
+; GFX1010-NEXT: s_mul_i32 s11, s4, s11
; GFX1010-NEXT: s_addc_u32 s12, s14, s15
; GFX1010-NEXT: s_addc_u32 s13, s17, 0
; GFX1010-NEXT: s_add_u32 s11, s12, s11
; GFX1010-NEXT: s_addc_u32 s12, 0, s13
-; GFX1010-NEXT: s_add_u32 s8, s8, s11
-; GFX1010-NEXT: s_addc_u32 s5, s5, s12
-; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1010-NEXT: s_mul_i32 s12, s9, s8
-; GFX1010-NEXT: s_mul_i32 s9, s9, s5
-; GFX1010-NEXT: s_mul_i32 s10, s10, s8
+; GFX1010-NEXT: s_add_u32 s5, s5, s11
+; GFX1010-NEXT: s_addc_u32 s4, s4, s12
+; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s5
+; GFX1010-NEXT: s_mul_i32 s12, s9, s5
+; GFX1010-NEXT: s_mul_i32 s9, s9, s4
+; GFX1010-NEXT: s_mul_i32 s10, s10, s5
; GFX1010-NEXT: s_add_i32 s9, s11, s9
-; GFX1010-NEXT: s_mul_i32 s11, s5, s12
+; GFX1010-NEXT: s_mul_i32 s11, s4, s12
; GFX1010-NEXT: s_add_i32 s9, s9, s10
-; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12
-; GFX1010-NEXT: s_mul_i32 s15, s8, s9
-; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9
+; GFX1010-NEXT: s_mul_hi_u32 s10, s5, s12
+; GFX1010-NEXT: s_mul_i32 s15, s5, s9
+; GFX1010-NEXT: s_mul_hi_u32 s14, s5, s9
; GFX1010-NEXT: s_add_u32 s10, s10, s15
-; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12
+; GFX1010-NEXT: s_mul_hi_u32 s13, s4, s12
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1010-NEXT: s_mul_hi_u32 s12, s4, s9
; GFX1010-NEXT: s_add_u32 s10, s10, s11
-; GFX1010-NEXT: s_mul_i32 s9, s5, s9
+; GFX1010-NEXT: s_mul_i32 s9, s4, s9
; GFX1010-NEXT: s_addc_u32 s10, s14, s13
; GFX1010-NEXT: s_addc_u32 s11, s12, 0
; GFX1010-NEXT: s_add_u32 s9, s10, s9
; GFX1010-NEXT: s_addc_u32 s10, 0, s11
-; GFX1010-NEXT: s_add_u32 s8, s8, s9
-; GFX1010-NEXT: s_addc_u32 s5, s5, s10
-; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1010-NEXT: s_mul_i32 s12, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8
-; GFX1010-NEXT: s_mul_i32 s8, s3, s8
+; GFX1010-NEXT: s_add_u32 s5, s5, s9
+; GFX1010-NEXT: s_addc_u32 s4, s4, s10
+; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s5
+; GFX1010-NEXT: s_mul_i32 s12, s2, s4
+; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s4
+; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s5
+; GFX1010-NEXT: s_mul_i32 s5, s3, s5
; GFX1010-NEXT: s_add_u32 s9, s9, s12
; GFX1010-NEXT: s_addc_u32 s11, 0, s11
-; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1010-NEXT: s_add_u32 s8, s9, s8
-; GFX1010-NEXT: s_mul_i32 s5, s3, s5
-; GFX1010-NEXT: s_addc_u32 s8, s11, s10
+; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s4
+; GFX1010-NEXT: s_add_u32 s5, s9, s5
+; GFX1010-NEXT: s_mul_i32 s4, s3, s4
+; GFX1010-NEXT: s_addc_u32 s5, s11, s10
; GFX1010-NEXT: s_addc_u32 s9, s13, 0
-; GFX1010-NEXT: s_add_u32 s5, s8, s5
-; GFX1010-NEXT: s_addc_u32 s8, 0, s9
-; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s5
-; GFX1010-NEXT: s_mul_i32 s10, s6, s8
-; GFX1010-NEXT: s_mul_i32 s11, s7, s5
-; GFX1010-NEXT: s_add_i32 s9, s9, s10
+; GFX1010-NEXT: s_add_u32 s4, s5, s4
+; GFX1010-NEXT: s_addc_u32 s5, 0, s9
+; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s4
; GFX1010-NEXT: s_mul_i32 s10, s6, s5
+; GFX1010-NEXT: s_mul_i32 s11, s7, s4
+; GFX1010-NEXT: s_add_i32 s9, s9, s10
+; GFX1010-NEXT: s_mul_i32 s10, s6, s4
; GFX1010-NEXT: s_add_i32 s9, s9, s11
; GFX1010-NEXT: s_sub_i32 s11, s3, s9
; GFX1010-NEXT: s_sub_u32 s10, s2, s10
@@ -2518,10 +2513,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_cselect_b32 s13, -1, 0
; GFX1010-NEXT: s_cmp_eq_u32 s11, s7
; GFX1010-NEXT: s_cselect_b32 s11, s13, s14
-; GFX1010-NEXT: s_add_u32 s13, s5, 1
-; GFX1010-NEXT: s_addc_u32 s14, s8, 0
-; GFX1010-NEXT: s_add_u32 s15, s5, 2
-; GFX1010-NEXT: s_addc_u32 s16, s8, 0
+; GFX1010-NEXT: s_add_u32 s13, s4, 1
+; GFX1010-NEXT: s_addc_u32 s14, s5, 0
+; GFX1010-NEXT: s_add_u32 s15, s4, 2
+; GFX1010-NEXT: s_addc_u32 s16, s5, 0
; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
; GFX1010-NEXT: s_cselect_b32 s11, s15, s13
; GFX1010-NEXT: s_cselect_b32 s13, s16, s14
@@ -2534,14 +2529,13 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_cmp_eq_u32 s3, s7
; GFX1010-NEXT: s_cselect_b32 s3, s10, s9
; GFX1010-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1010-NEXT: s_cselect_b32 s9, s13, s8
-; GFX1010-NEXT: s_cselect_b32 s8, s11, s5
-; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
+; GFX1010-NEXT: s_cselect_b32 s5, s13, s5
+; GFX1010-NEXT: s_cselect_b32 s4, s11, s4
+; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3
; GFX1010-NEXT: .LBB16_2:
; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX1010-NEXT: s_sub_i32 s4, 0, s6
-; GFX1010-NEXT: s_mov_b32 s9, 0
; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX1010-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -2559,15 +2553,16 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_cselect_b32 s2, s5, s2
; GFX1010-NEXT: s_add_i32 s4, s3, 1
; GFX1010-NEXT: s_cmp_ge_u32 s2, s6
-; GFX1010-NEXT: s_cselect_b32 s8, s4, s3
+; GFX1010-NEXT: s_mov_b32 s5, 0
+; GFX1010-NEXT: s_cselect_b32 s4, s4, s3
; GFX1010-NEXT: .LBB16_3:
-; GFX1010-NEXT: v_mov_b32_e32 v0, s8
+; GFX1010-NEXT: v_mov_b32_e32 v0, s4
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
-; GFX1010-NEXT: v_mov_b32_e32 v1, s9
+; GFX1010-NEXT: v_mov_b32_e32 v1, s5
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: s_endpgm
; GFX1010-NEXT: .LBB16_4:
-; GFX1010-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX1010-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1010-NEXT: s_branch .LBB16_2
;
; GFX1030W32-LABEL: sudiv64:
@@ -2575,10 +2570,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX1030W32-NEXT: s_mov_b32 s8, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5]
-; GFX1030W32-NEXT: s_mov_b32 s6, 0
-; GFX1030W32-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1030W32-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1030W32-NEXT: ; %bb.1:
; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2593,71 +2587,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1030W32-NEXT: v_readfirstlane_b32 s7, v1
-; GFX1030W32-NEXT: v_readfirstlane_b32 s8, v0
-; GFX1030W32-NEXT: s_mul_i32 s11, s9, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s12, s10, s8
+; GFX1030W32-NEXT: v_readfirstlane_b32 s6, v1
+; GFX1030W32-NEXT: v_readfirstlane_b32 s7, v0
+; GFX1030W32-NEXT: s_mul_i32 s11, s9, s6
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s7
+; GFX1030W32-NEXT: s_mul_i32 s12, s10, s7
; GFX1030W32-NEXT: s_add_i32 s11, s13, s11
-; GFX1030W32-NEXT: s_mul_i32 s14, s9, s8
+; GFX1030W32-NEXT: s_mul_i32 s14, s9, s7
; GFX1030W32-NEXT: s_add_i32 s11, s11, s12
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s8, s14
-; GFX1030W32-NEXT: s_mul_i32 s16, s8, s11
-; GFX1030W32-NEXT: s_mul_hi_u32 s15, s7, s14
-; GFX1030W32-NEXT: s_mul_i32 s12, s7, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s14
+; GFX1030W32-NEXT: s_mul_i32 s16, s7, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s15, s6, s14
+; GFX1030W32-NEXT: s_mul_i32 s12, s6, s14
+; GFX1030W32-NEXT: s_mul_hi_u32 s14, s7, s11
; GFX1030W32-NEXT: s_add_u32 s13, s13, s16
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s17, s7, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s17, s6, s11
; GFX1030W32-NEXT: s_add_u32 s12, s13, s12
-; GFX1030W32-NEXT: s_mul_i32 s11, s7, s11
+; GFX1030W32-NEXT: s_mul_i32 s11, s6, s11
; GFX1030W32-NEXT: s_addc_u32 s12, s14, s15
; GFX1030W32-NEXT: s_addc_u32 s13, s17, 0
; GFX1030W32-NEXT: s_add_u32 s11, s12, s11
; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13
-; GFX1030W32-NEXT: s_add_u32 s8, s8, s11
-; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
-; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT: s_add_u32 s7, s7, s11
+; GFX1030W32-NEXT: s_addc_u32 s6, s6, s12
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s7
+; GFX1030W32-NEXT: s_mul_i32 s12, s9, s7
+; GFX1030W32-NEXT: s_mul_i32 s9, s9, s6
+; GFX1030W32-NEXT: s_mul_i32 s10, s10, s7
; GFX1030W32-NEXT: s_add_i32 s9, s11, s9
-; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12
+; GFX1030W32-NEXT: s_mul_i32 s11, s6, s12
; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12
-; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s7, s12
+; GFX1030W32-NEXT: s_mul_i32 s15, s7, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s14, s7, s9
; GFX1030W32-NEXT: s_add_u32 s10, s10, s15
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s6, s12
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s12, s6, s9
; GFX1030W32-NEXT: s_add_u32 s10, s10, s11
-; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9
+; GFX1030W32-NEXT: s_mul_i32 s9, s6, s9
; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13
; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0
; GFX1030W32-NEXT: s_add_u32 s9, s10, s9
; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11
-; GFX1030W32-NEXT: s_add_u32 s8, s8, s9
-; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8
-; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT: s_add_u32 s7, s7, s9
+; GFX1030W32-NEXT: s_addc_u32 s6, s6, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s7
+; GFX1030W32-NEXT: s_mul_i32 s12, s2, s6
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s6
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s7
+; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
; GFX1030W32-NEXT: s_add_u32 s9, s9, s12
; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX1030W32-NEXT: s_add_u32 s8, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s6
+; GFX1030W32-NEXT: s_add_u32 s7, s9, s7
+; GFX1030W32-NEXT: s_mul_i32 s6, s3, s6
+; GFX1030W32-NEXT: s_addc_u32 s7, s11, s10
; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0
-; GFX1030W32-NEXT: s_add_u32 s7, s8, s7
-; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s4, s7
-; GFX1030W32-NEXT: s_mul_i32 s10, s4, s8
-; GFX1030W32-NEXT: s_mul_i32 s11, s5, s7
-; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
+; GFX1030W32-NEXT: s_add_u32 s6, s7, s6
+; GFX1030W32-NEXT: s_addc_u32 s7, 0, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s4, s6
; GFX1030W32-NEXT: s_mul_i32 s10, s4, s7
+; GFX1030W32-NEXT: s_mul_i32 s11, s5, s6
+; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
+; GFX1030W32-NEXT: s_mul_i32 s10, s4, s6
; GFX1030W32-NEXT: s_add_i32 s9, s9, s11
; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9
; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10
@@ -2671,10 +2665,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_cselect_b32 s13, -1, 0
; GFX1030W32-NEXT: s_cmp_eq_u32 s11, s5
; GFX1030W32-NEXT: s_cselect_b32 s11, s13, s14
-; GFX1030W32-NEXT: s_add_u32 s13, s7, 1
-; GFX1030W32-NEXT: s_addc_u32 s14, s8, 0
-; GFX1030W32-NEXT: s_add_u32 s15, s7, 2
-; GFX1030W32-NEXT: s_addc_u32 s16, s8, 0
+; GFX1030W32-NEXT: s_add_u32 s13, s6, 1
+; GFX1030W32-NEXT: s_addc_u32 s14, s7, 0
+; GFX1030W32-NEXT: s_add_u32 s15, s6, 2
+; GFX1030W32-NEXT: s_addc_u32 s16, s7, 0
; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
; GFX1030W32-NEXT: s_cselect_b32 s11, s15, s13
; GFX1030W32-NEXT: s_cselect_b32 s13, s16, s14
@@ -2687,14 +2681,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_cmp_eq_u32 s3, s5
; GFX1030W32-NEXT: s_cselect_b32 s3, s10, s9
; GFX1030W32-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1030W32-NEXT: s_cselect_b32 s9, s13, s8
-; GFX1030W32-NEXT: s_cselect_b32 s8, s11, s7
-; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s6
+; GFX1030W32-NEXT: s_cselect_b32 s7, s13, s7
+; GFX1030W32-NEXT: s_cselect_b32 s6, s11, s6
+; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX1030W32-NEXT: s_cbranch_vccnz .LBB16_3
; GFX1030W32-NEXT: .LBB16_2:
; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX1030W32-NEXT: s_sub_i32 s5, 0, s4
-; GFX1030W32-NEXT: s_mov_b32 s9, 0
+; GFX1030W32-NEXT: s_mov_b32 s7, 0
; GFX1030W32-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -2712,15 +2706,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_cselect_b32 s2, s6, s2
; GFX1030W32-NEXT: s_add_i32 s5, s3, 1
; GFX1030W32-NEXT: s_cmp_ge_u32 s2, s4
-; GFX1030W32-NEXT: s_cselect_b32 s8, s5, s3
+; GFX1030W32-NEXT: s_cselect_b32 s6, s5, s3
; GFX1030W32-NEXT: .LBB16_3:
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s9
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: s_endpgm
; GFX1030W32-NEXT: .LBB16_4:
-; GFX1030W32-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX1030W32-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1030W32-NEXT: s_branch .LBB16_2
;
; GFX1030W64-LABEL: sudiv64:
@@ -2730,8 +2724,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5]
-; GFX1030W64-NEXT: s_mov_b32 s6, 0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1030W64-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1030W64-NEXT: ; %bb.1:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2880,11 +2872,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %ou...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/167364
More information about the llvm-commits
mailing list