[llvm] [AMDGPU] Analyze REG_SEQUENCE To Remove Redundant CMP Instructions (PR #167364)

via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 10 10:33:03 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Patrick Simmons (linuxrocks123)

<details>
<summary>Changes</summary>

This PR adds analysis of REG_SEQUENCE instructions to the AMDGPU TII so as to allow the peephole optimizer to remove redundant CMP instructions even if the REG_SEQUENCE pseudo is in between the def and use of SCC.


---

Patch is 48.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167364.diff


7 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+27) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+168-178) 
- (added) llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll (+22) 
- (modified) llvm/test/CodeGen/AMDGPU/s_cmp_0.ll (-4) 
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+39-59) 
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+56-59) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..951df6899f5a1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1312,6 +1312,30 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
   return Reg;
 }
 
+MachineInstr *
+SIInstrInfo::pierceThroughRegSequence(const MachineInstr &MI) const {
+  if (MI.getOpcode() != AMDGPU::REG_SEQUENCE)
+    return nullptr;
+
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  int64_t SubRegValues[2];
+  bool SubRegIsConst[2];
+  MachineInstr *RealDefs[2];
+  for (unsigned I : {2, 4}) {
+    unsigned ArrayIdx = MI.getOperand(I).getImm() == AMDGPU::sub0 ? 0 : 1;
+    Register Subreg = MI.getOperand(I - 1).getReg();
+    RealDefs[ArrayIdx] = MRI.getUniqueVRegDef(Subreg);
+    SubRegIsConst[ArrayIdx] = getConstValDefinedInReg(
+        *RealDefs[ArrayIdx], Subreg, SubRegValues[ArrayIdx]);
+  }
+
+  for (unsigned I : {0, 1})
+    if (SubRegIsConst[I] && !SubRegValues[I])
+      return RealDefs[(I + 1) % 2];
+
+  return nullptr;
+}
+
 bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
                                           const Register Reg,
                                           int64_t &ImmVal) const {
@@ -10676,6 +10700,9 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (!Def || Def->getParent() != CmpInstr.getParent())
       return false;
 
+    if (MachineInstr *RegSequenceDef = pierceThroughRegSequence(*Def))
+      Def = RegSequenceDef;
+
     // For S_OP that set SCC = DST!=0, do the transformation
     //
     //   s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0643b532ea04c..d7d049f722b47 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -714,6 +714,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     }
   }
 
+  MachineInstr *pierceThroughRegSequence(const MachineInstr &MI) const;
+
   static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
     switch (MI.getOpcode()) {
     case AMDGPU::S_ABSDIFF_I32:
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b96de173dc8c6..e43967626c764 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2120,8 +2120,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_or_b64 s[6:7], s[2:3], s[4:5]
-; VI-NEXT:    s_mov_b32 s6, 0
-; VI-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; VI-NEXT:    s_cbranch_scc0 .LBB16_3
 ; VI-NEXT:  ; %bb.1:
 ; VI-NEXT:    v_cvt_f32_u32_e32 v0, s4
@@ -2272,8 +2270,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB16_4
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
@@ -2422,10 +2418,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    s_clause 0x1
 ; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX1010-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1010-NEXT:    s_mov_b32 s8, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010-NEXT:    s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX1010-NEXT:    s_mov_b32 s4, 0
-; GFX1010-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX1010-NEXT:    s_cbranch_scc0 .LBB16_4
 ; GFX1010-NEXT:  ; %bb.1:
 ; GFX1010-NEXT:    v_cvt_f32_u32_e32 v0, s6
@@ -2440,71 +2435,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX1010-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1010-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX1010-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX1010-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX1010-NEXT:    s_mul_i32 s11, s9, s5
-; GFX1010-NEXT:    s_mul_hi_u32 s13, s9, s8
-; GFX1010-NEXT:    s_mul_i32 s12, s10, s8
+; GFX1010-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1010-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX1010-NEXT:    s_mul_i32 s11, s9, s4
+; GFX1010-NEXT:    s_mul_hi_u32 s13, s9, s5
+; GFX1010-NEXT:    s_mul_i32 s12, s10, s5
 ; GFX1010-NEXT:    s_add_i32 s11, s13, s11
-; GFX1010-NEXT:    s_mul_i32 s14, s9, s8
+; GFX1010-NEXT:    s_mul_i32 s14, s9, s5
 ; GFX1010-NEXT:    s_add_i32 s11, s11, s12
-; GFX1010-NEXT:    s_mul_hi_u32 s13, s8, s14
-; GFX1010-NEXT:    s_mul_i32 s16, s8, s11
-; GFX1010-NEXT:    s_mul_hi_u32 s15, s5, s14
-; GFX1010-NEXT:    s_mul_i32 s12, s5, s14
-; GFX1010-NEXT:    s_mul_hi_u32 s14, s8, s11
+; GFX1010-NEXT:    s_mul_hi_u32 s13, s5, s14
+; GFX1010-NEXT:    s_mul_i32 s16, s5, s11
+; GFX1010-NEXT:    s_mul_hi_u32 s15, s4, s14
+; GFX1010-NEXT:    s_mul_i32 s12, s4, s14
+; GFX1010-NEXT:    s_mul_hi_u32 s14, s5, s11
 ; GFX1010-NEXT:    s_add_u32 s13, s13, s16
 ; GFX1010-NEXT:    s_addc_u32 s14, 0, s14
-; GFX1010-NEXT:    s_mul_hi_u32 s17, s5, s11
+; GFX1010-NEXT:    s_mul_hi_u32 s17, s4, s11
 ; GFX1010-NEXT:    s_add_u32 s12, s13, s12
-; GFX1010-NEXT:    s_mul_i32 s11, s5, s11
+; GFX1010-NEXT:    s_mul_i32 s11, s4, s11
 ; GFX1010-NEXT:    s_addc_u32 s12, s14, s15
 ; GFX1010-NEXT:    s_addc_u32 s13, s17, 0
 ; GFX1010-NEXT:    s_add_u32 s11, s12, s11
 ; GFX1010-NEXT:    s_addc_u32 s12, 0, s13
-; GFX1010-NEXT:    s_add_u32 s8, s8, s11
-; GFX1010-NEXT:    s_addc_u32 s5, s5, s12
-; GFX1010-NEXT:    s_mul_hi_u32 s11, s9, s8
-; GFX1010-NEXT:    s_mul_i32 s12, s9, s8
-; GFX1010-NEXT:    s_mul_i32 s9, s9, s5
-; GFX1010-NEXT:    s_mul_i32 s10, s10, s8
+; GFX1010-NEXT:    s_add_u32 s5, s5, s11
+; GFX1010-NEXT:    s_addc_u32 s4, s4, s12
+; GFX1010-NEXT:    s_mul_hi_u32 s11, s9, s5
+; GFX1010-NEXT:    s_mul_i32 s12, s9, s5
+; GFX1010-NEXT:    s_mul_i32 s9, s9, s4
+; GFX1010-NEXT:    s_mul_i32 s10, s10, s5
 ; GFX1010-NEXT:    s_add_i32 s9, s11, s9
-; GFX1010-NEXT:    s_mul_i32 s11, s5, s12
+; GFX1010-NEXT:    s_mul_i32 s11, s4, s12
 ; GFX1010-NEXT:    s_add_i32 s9, s9, s10
-; GFX1010-NEXT:    s_mul_hi_u32 s10, s8, s12
-; GFX1010-NEXT:    s_mul_i32 s15, s8, s9
-; GFX1010-NEXT:    s_mul_hi_u32 s14, s8, s9
+; GFX1010-NEXT:    s_mul_hi_u32 s10, s5, s12
+; GFX1010-NEXT:    s_mul_i32 s15, s5, s9
+; GFX1010-NEXT:    s_mul_hi_u32 s14, s5, s9
 ; GFX1010-NEXT:    s_add_u32 s10, s10, s15
-; GFX1010-NEXT:    s_mul_hi_u32 s13, s5, s12
+; GFX1010-NEXT:    s_mul_hi_u32 s13, s4, s12
 ; GFX1010-NEXT:    s_addc_u32 s14, 0, s14
-; GFX1010-NEXT:    s_mul_hi_u32 s12, s5, s9
+; GFX1010-NEXT:    s_mul_hi_u32 s12, s4, s9
 ; GFX1010-NEXT:    s_add_u32 s10, s10, s11
-; GFX1010-NEXT:    s_mul_i32 s9, s5, s9
+; GFX1010-NEXT:    s_mul_i32 s9, s4, s9
 ; GFX1010-NEXT:    s_addc_u32 s10, s14, s13
 ; GFX1010-NEXT:    s_addc_u32 s11, s12, 0
 ; GFX1010-NEXT:    s_add_u32 s9, s10, s9
 ; GFX1010-NEXT:    s_addc_u32 s10, 0, s11
-; GFX1010-NEXT:    s_add_u32 s8, s8, s9
-; GFX1010-NEXT:    s_addc_u32 s5, s5, s10
-; GFX1010-NEXT:    s_mul_hi_u32 s9, s2, s8
-; GFX1010-NEXT:    s_mul_i32 s12, s2, s5
-; GFX1010-NEXT:    s_mul_hi_u32 s11, s2, s5
-; GFX1010-NEXT:    s_mul_hi_u32 s10, s3, s8
-; GFX1010-NEXT:    s_mul_i32 s8, s3, s8
+; GFX1010-NEXT:    s_add_u32 s5, s5, s9
+; GFX1010-NEXT:    s_addc_u32 s4, s4, s10
+; GFX1010-NEXT:    s_mul_hi_u32 s9, s2, s5
+; GFX1010-NEXT:    s_mul_i32 s12, s2, s4
+; GFX1010-NEXT:    s_mul_hi_u32 s11, s2, s4
+; GFX1010-NEXT:    s_mul_hi_u32 s10, s3, s5
+; GFX1010-NEXT:    s_mul_i32 s5, s3, s5
 ; GFX1010-NEXT:    s_add_u32 s9, s9, s12
 ; GFX1010-NEXT:    s_addc_u32 s11, 0, s11
-; GFX1010-NEXT:    s_mul_hi_u32 s13, s3, s5
-; GFX1010-NEXT:    s_add_u32 s8, s9, s8
-; GFX1010-NEXT:    s_mul_i32 s5, s3, s5
-; GFX1010-NEXT:    s_addc_u32 s8, s11, s10
+; GFX1010-NEXT:    s_mul_hi_u32 s13, s3, s4
+; GFX1010-NEXT:    s_add_u32 s5, s9, s5
+; GFX1010-NEXT:    s_mul_i32 s4, s3, s4
+; GFX1010-NEXT:    s_addc_u32 s5, s11, s10
 ; GFX1010-NEXT:    s_addc_u32 s9, s13, 0
-; GFX1010-NEXT:    s_add_u32 s5, s8, s5
-; GFX1010-NEXT:    s_addc_u32 s8, 0, s9
-; GFX1010-NEXT:    s_mul_hi_u32 s9, s6, s5
-; GFX1010-NEXT:    s_mul_i32 s10, s6, s8
-; GFX1010-NEXT:    s_mul_i32 s11, s7, s5
-; GFX1010-NEXT:    s_add_i32 s9, s9, s10
+; GFX1010-NEXT:    s_add_u32 s4, s5, s4
+; GFX1010-NEXT:    s_addc_u32 s5, 0, s9
+; GFX1010-NEXT:    s_mul_hi_u32 s9, s6, s4
 ; GFX1010-NEXT:    s_mul_i32 s10, s6, s5
+; GFX1010-NEXT:    s_mul_i32 s11, s7, s4
+; GFX1010-NEXT:    s_add_i32 s9, s9, s10
+; GFX1010-NEXT:    s_mul_i32 s10, s6, s4
 ; GFX1010-NEXT:    s_add_i32 s9, s9, s11
 ; GFX1010-NEXT:    s_sub_i32 s11, s3, s9
 ; GFX1010-NEXT:    s_sub_u32 s10, s2, s10
@@ -2518,10 +2513,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    s_cselect_b32 s13, -1, 0
 ; GFX1010-NEXT:    s_cmp_eq_u32 s11, s7
 ; GFX1010-NEXT:    s_cselect_b32 s11, s13, s14
-; GFX1010-NEXT:    s_add_u32 s13, s5, 1
-; GFX1010-NEXT:    s_addc_u32 s14, s8, 0
-; GFX1010-NEXT:    s_add_u32 s15, s5, 2
-; GFX1010-NEXT:    s_addc_u32 s16, s8, 0
+; GFX1010-NEXT:    s_add_u32 s13, s4, 1
+; GFX1010-NEXT:    s_addc_u32 s14, s5, 0
+; GFX1010-NEXT:    s_add_u32 s15, s4, 2
+; GFX1010-NEXT:    s_addc_u32 s16, s5, 0
 ; GFX1010-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX1010-NEXT:    s_cselect_b32 s11, s15, s13
 ; GFX1010-NEXT:    s_cselect_b32 s13, s16, s14
@@ -2534,14 +2529,13 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    s_cmp_eq_u32 s3, s7
 ; GFX1010-NEXT:    s_cselect_b32 s3, s10, s9
 ; GFX1010-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1010-NEXT:    s_cselect_b32 s9, s13, s8
-; GFX1010-NEXT:    s_cselect_b32 s8, s11, s5
-; GFX1010-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
+; GFX1010-NEXT:    s_cselect_b32 s5, s13, s5
+; GFX1010-NEXT:    s_cselect_b32 s4, s11, s4
+; GFX1010-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
 ; GFX1010-NEXT:    s_cbranch_vccnz .LBB16_3
 ; GFX1010-NEXT:  .LBB16_2:
 ; GFX1010-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX1010-NEXT:    s_sub_i32 s4, 0, s6
-; GFX1010-NEXT:    s_mov_b32 s9, 0
 ; GFX1010-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX1010-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX1010-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -2559,15 +2553,16 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    s_cselect_b32 s2, s5, s2
 ; GFX1010-NEXT:    s_add_i32 s4, s3, 1
 ; GFX1010-NEXT:    s_cmp_ge_u32 s2, s6
-; GFX1010-NEXT:    s_cselect_b32 s8, s4, s3
+; GFX1010-NEXT:    s_mov_b32 s5, 0
+; GFX1010-NEXT:    s_cselect_b32 s4, s4, s3
 ; GFX1010-NEXT:  .LBB16_3:
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1010-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1010-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1010-NEXT:    s_endpgm
 ; GFX1010-NEXT:  .LBB16_4:
-; GFX1010-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX1010-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1010-NEXT:    s_branch .LBB16_2
 ;
 ; GFX1030W32-LABEL: sudiv64:
@@ -2575,10 +2570,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    s_clause 0x1
 ; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX1030W32-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX1030W32-NEXT:    s_mov_b32 s8, 0
 ; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030W32-NEXT:    s_or_b64 s[6:7], s[2:3], s[4:5]
-; GFX1030W32-NEXT:    s_mov_b32 s6, 0
-; GFX1030W32-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX1030W32-NEXT:    s_cbranch_scc0 .LBB16_4
 ; GFX1030W32-NEXT:  ; %bb.1:
 ; GFX1030W32-NEXT:    v_cvt_f32_u32_e32 v0, s4
@@ -2593,71 +2587,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
 ; GFX1030W32-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1030W32-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX1030W32-NEXT:    v_readfirstlane_b32 s7, v1
-; GFX1030W32-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX1030W32-NEXT:    s_mul_i32 s11, s9, s7
-; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s9, s8
-; GFX1030W32-NEXT:    s_mul_i32 s12, s10, s8
+; GFX1030W32-NEXT:    v_readfirstlane_b32 s6, v1
+; GFX1030W32-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX1030W32-NEXT:    s_mul_i32 s11, s9, s6
+; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s9, s7
+; GFX1030W32-NEXT:    s_mul_i32 s12, s10, s7
 ; GFX1030W32-NEXT:    s_add_i32 s11, s13, s11
-; GFX1030W32-NEXT:    s_mul_i32 s14, s9, s8
+; GFX1030W32-NEXT:    s_mul_i32 s14, s9, s7
 ; GFX1030W32-NEXT:    s_add_i32 s11, s11, s12
-; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s8, s14
-; GFX1030W32-NEXT:    s_mul_i32 s16, s8, s11
-; GFX1030W32-NEXT:    s_mul_hi_u32 s15, s7, s14
-; GFX1030W32-NEXT:    s_mul_i32 s12, s7, s14
-; GFX1030W32-NEXT:    s_mul_hi_u32 s14, s8, s11
+; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s7, s14
+; GFX1030W32-NEXT:    s_mul_i32 s16, s7, s11
+; GFX1030W32-NEXT:    s_mul_hi_u32 s15, s6, s14
+; GFX1030W32-NEXT:    s_mul_i32 s12, s6, s14
+; GFX1030W32-NEXT:    s_mul_hi_u32 s14, s7, s11
 ; GFX1030W32-NEXT:    s_add_u32 s13, s13, s16
 ; GFX1030W32-NEXT:    s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT:    s_mul_hi_u32 s17, s7, s11
+; GFX1030W32-NEXT:    s_mul_hi_u32 s17, s6, s11
 ; GFX1030W32-NEXT:    s_add_u32 s12, s13, s12
-; GFX1030W32-NEXT:    s_mul_i32 s11, s7, s11
+; GFX1030W32-NEXT:    s_mul_i32 s11, s6, s11
 ; GFX1030W32-NEXT:    s_addc_u32 s12, s14, s15
 ; GFX1030W32-NEXT:    s_addc_u32 s13, s17, 0
 ; GFX1030W32-NEXT:    s_add_u32 s11, s12, s11
 ; GFX1030W32-NEXT:    s_addc_u32 s12, 0, s13
-; GFX1030W32-NEXT:    s_add_u32 s8, s8, s11
-; GFX1030W32-NEXT:    s_addc_u32 s7, s7, s12
-; GFX1030W32-NEXT:    s_mul_hi_u32 s11, s9, s8
-; GFX1030W32-NEXT:    s_mul_i32 s12, s9, s8
-; GFX1030W32-NEXT:    s_mul_i32 s9, s9, s7
-; GFX1030W32-NEXT:    s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT:    s_add_u32 s7, s7, s11
+; GFX1030W32-NEXT:    s_addc_u32 s6, s6, s12
+; GFX1030W32-NEXT:    s_mul_hi_u32 s11, s9, s7
+; GFX1030W32-NEXT:    s_mul_i32 s12, s9, s7
+; GFX1030W32-NEXT:    s_mul_i32 s9, s9, s6
+; GFX1030W32-NEXT:    s_mul_i32 s10, s10, s7
 ; GFX1030W32-NEXT:    s_add_i32 s9, s11, s9
-; GFX1030W32-NEXT:    s_mul_i32 s11, s7, s12
+; GFX1030W32-NEXT:    s_mul_i32 s11, s6, s12
 ; GFX1030W32-NEXT:    s_add_i32 s9, s9, s10
-; GFX1030W32-NEXT:    s_mul_hi_u32 s10, s8, s12
-; GFX1030W32-NEXT:    s_mul_i32 s15, s8, s9
-; GFX1030W32-NEXT:    s_mul_hi_u32 s14, s8, s9
+; GFX1030W32-NEXT:    s_mul_hi_u32 s10, s7, s12
+; GFX1030W32-NEXT:    s_mul_i32 s15, s7, s9
+; GFX1030W32-NEXT:    s_mul_hi_u32 s14, s7, s9
 ; GFX1030W32-NEXT:    s_add_u32 s10, s10, s15
-; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s7, s12
+; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s6, s12
 ; GFX1030W32-NEXT:    s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT:    s_mul_hi_u32 s12, s7, s9
+; GFX1030W32-NEXT:    s_mul_hi_u32 s12, s6, s9
 ; GFX1030W32-NEXT:    s_add_u32 s10, s10, s11
-; GFX1030W32-NEXT:    s_mul_i32 s9, s7, s9
+; GFX1030W32-NEXT:    s_mul_i32 s9, s6, s9
 ; GFX1030W32-NEXT:    s_addc_u32 s10, s14, s13
 ; GFX1030W32-NEXT:    s_addc_u32 s11, s12, 0
 ; GFX1030W32-NEXT:    s_add_u32 s9, s10, s9
 ; GFX1030W32-NEXT:    s_addc_u32 s10, 0, s11
-; GFX1030W32-NEXT:    s_add_u32 s8, s8, s9
-; GFX1030W32-NEXT:    s_addc_u32 s7, s7, s10
-; GFX1030W32-NEXT:    s_mul_hi_u32 s9, s2, s8
-; GFX1030W32-NEXT:    s_mul_i32 s12, s2, s7
-; GFX1030W32-NEXT:    s_mul_hi_u32 s11, s2, s7
-; GFX1030W32-NEXT:    s_mul_hi_u32 s10, s3, s8
-; GFX1030W32-NEXT:    s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT:    s_add_u32 s7, s7, s9
+; GFX1030W32-NEXT:    s_addc_u32 s6, s6, s10
+; GFX1030W32-NEXT:    s_mul_hi_u32 s9, s2, s7
+; GFX1030W32-NEXT:    s_mul_i32 s12, s2, s6
+; GFX1030W32-NEXT:    s_mul_hi_u32 s11, s2, s6
+; GFX1030W32-NEXT:    s_mul_hi_u32 s10, s3, s7
+; GFX1030W32-NEXT:    s_mul_i32 s7, s3, s7
 ; GFX1030W32-NEXT:    s_add_u32 s9, s9, s12
 ; GFX1030W32-NEXT:    s_addc_u32 s11, 0, s11
-; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s3, s7
-; GFX1030W32-NEXT:    s_add_u32 s8, s9, s8
-; GFX1030W32-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1030W32-NEXT:    s_addc_u32 s8, s11, s10
+; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s3, s6
+; GFX1030W32-NEXT:    s_add_u32 s7, s9, s7
+; GFX1030W32-NEXT:    s_mul_i32 s6, s3, s6
+; GFX1030W32-NEXT:    s_addc_u32 s7, s11, s10
 ; GFX1030W32-NEXT:    s_addc_u32 s9, s13, 0
-; GFX1030W32-NEXT:    s_add_u32 s7, s8, s7
-; GFX1030W32-NEXT:    s_addc_u32 s8, 0, s9
-; GFX1030W32-NEXT:    s_mul_hi_u32 s9, s4, s7
-; GFX1030W32-NEXT:    s_mul_i32 s10, s4, s8
-; GFX1030W32-NEXT:    s_mul_i32 s11, s5, s7
-; GFX1030W32-NEXT:    s_add_i32 s9, s9, s10
+; GFX1030W32-NEXT:    s_add_u32 s6, s7, s6
+; GFX1030W32-NEXT:    s_addc_u32 s7, 0, s9
+; GFX1030W32-NEXT:    s_mul_hi_u32 s9, s4, s6
 ; GFX1030W32-NEXT:    s_mul_i32 s10, s4, s7
+; GFX1030W32-NEXT:    s_mul_i32 s11, s5, s6
+; GFX1030W32-NEXT:    s_add_i32 s9, s9, s10
+; GFX1030W32-NEXT:    s_mul_i32 s10, s4, s6
 ; GFX1030W32-NEXT:    s_add_i32 s9, s9, s11
 ; GFX1030W32-NEXT:    s_sub_i32 s11, s3, s9
 ; GFX1030W32-NEXT:    s_sub_u32 s10, s2, s10
@@ -2671,10 +2665,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    s_cselect_b32 s13, -1, 0
 ; GFX1030W32-NEXT:    s_cmp_eq_u32 s11, s5
 ; GFX1030W32-NEXT:    s_cselect_b32 s11, s13, s14
-; GFX1030W32-NEXT:    s_add_u32 s13, s7, 1
-; GFX1030W32-NEXT:    s_addc_u32 s14, s8, 0
-; GFX1030W32-NEXT:    s_add_u32 s15, s7, 2
-; GFX1030W32-NEXT:    s_addc_u32 s16, s8, 0
+; GFX1030W32-NEXT:    s_add_u32 s13, s6, 1
+; GFX1030W32-NEXT:    s_addc_u32 s14, s7, 0
+; GFX1030W32-NEXT:    s_add_u32 s15, s6, 2
+; GFX1030W32-NEXT:    s_addc_u32 s16, s7, 0
 ; GFX1030W32-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX1030W32-NEXT:    s_cselect_b32 s11, s15, s13
 ; GFX1030W32-NEXT:    s_cselect_b32 s13, s16, s14
@@ -2687,14 +2681,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    s_cmp_eq_u32 s3, s5
 ; GFX1030W32-NEXT:    s_cselect_b32 s3, s10, s9
 ; GFX1030W32-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1030W32-NEXT:    s_cselect_b32 s9, s13, s8
-; GFX1030W32-NEXT:    s_cselect_b32 s8, s11, s7
-; GFX1030W32-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s6
+; GFX1030W32-NEXT:    s_cselect_b32 s7, s13, s7
+; GFX1030W32-NEXT:    s_cselect_b32 s6, s11, s6
+; GFX1030W32-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
 ; GFX1030W32-NEXT:    s_cbranch_vccnz .LBB16_3
 ; GFX1030W32-NEXT:  .LBB16_2:
 ; GFX1030W32-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX1030W32-NEXT:    s_sub_i32 s5, 0, s4
-; GFX1030W32-NEXT:    s_mov_b32 s9, 0
+; GFX1030W32-NEXT:    s_mov_b32 s7, 0
 ; GFX1030W32-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX1030W32-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX1030W32-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -2712,15 +2706,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    s_cselect_b32 s2, s6, s2
 ; GFX1030W32-NEXT:    s_add_i32 s5, s3, 1
 ; GFX1030W32-NEXT:    s_cmp_ge_u32 s2, s4
-; GFX1030W32-NEXT:    s_cselect_b32 s8, s5, s3
+; GFX1030W32-NEXT:    s_cselect_b32 s6, s5, s3
 ; GFX1030W32-NEXT:  .LBB16_3:
-; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1030W32-NEXT:    s_endpgm
 ; GFX1030W32-NEXT:  .LBB16_4:
-; GFX1030W32-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX1030W32-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1030W32-NEXT:    s_branch .LBB16_2
 ;
 ; GFX1030W64-LABEL: sudiv64:
@@ -2730,8 +2724,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W64-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030W64-NEXT:    s_or_b64 s[6:7], s[2:3], s[4:5]
-; GFX1030W64-NEXT:    s_mov_b32 s6, 0
-; GFX1030W64-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX1030W64-NEXT:    s_cbranch_scc0 .LBB16_4
 ; GFX1030W64-NEXT:  ; %bb.1:
 ; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v0, s4
@@ -2880,11 +2872,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %ou...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/167364


More information about the llvm-commits mailing list