[llvm] [AMDGPU] Calc IsVALU correctly during UADDO/USUBO selection (PR #159814)

via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 19 10:07:43 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: None (LU-JOHN)

<details>
<summary>Changes</summary>

Fix two bugs.  The first bug hid the second bug.

1. Calculate IsVALU correctly during UADDO/USUBO selection.  IsVALU should be false if the carryout users are UADDO_CARRY/USUBO_CARRY.  However instruction selection visits uses before defs, so the UADDO_CARRY/USUBO_CARRY nodes are normally (probably always) already converted to S_ADD_CO_PSEUDO/S_SUB_CO_PSEUDO.  Fix to check for these machine opcodes.  

2. Without this fix, UADDO/USUBO selection will always select the VALU instructions V_ADD_CO__U32_e64/V_SUB_CO_U32_e64.   S_UADDO_PSEUDO/S_USUBO_PSEUDO were never selected in the CodeGen/AMDGPU tests.  Thus, S_UADDO_PSEUDO/S_USUBO_PSEUDO cases were never hit in EmitInstrWithCustomInserter.  The code generation for S_UADDO_PSEUDO/S_USUBO_PSEUDO had a bug where it could not handle code generation for 32-bit $scc_out. 

---

Patch is 501.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159814.diff


11 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+11-4) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+8-3) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+1674-1496) 
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+898-917) 
- (modified) llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll (+32-29) 
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+277-215) 
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+1724-1605) 
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+428-317) 
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+121-88) 
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+273-196) 
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+255-259) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c2fca79979e1b..f544c215c8661 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1089,10 +1089,17 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
   for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
        ++UI)
     if (UI.getUse().getResNo() == 1) {
-      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
-          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
-        IsVALU = true;
-        break;
+      if (UI->isMachineOpcode()) {
+        if (UI->getMachineOpcode() !=
+            (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
+          IsVALU = true;
+          break;
+        }
+      } else {
+        if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
+          IsVALU = true;
+          break;
+        }
       }
     }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 363717b017ef0..e50bc6c4d363a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5946,6 +5946,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
     const DebugLoc &DL = MI.getDebugLoc();
     MachineOperand &Dest0 = MI.getOperand(0);
     MachineOperand &Dest1 = MI.getOperand(1);
@@ -5961,9 +5963,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
         .add(Src1);
     // clang-format on
 
-    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
-        .addImm(1)
-        .addImm(0);
+    const TargetRegisterClass *Dest1RC = MRI.getRegClass(Dest1.getReg());
+    unsigned Dest1Size = TRI->getRegSizeInBits(*Dest1RC);
+    assert(Dest1Size == 64 || Dest1Size == 32);
+    unsigned SelOpc =
+        (Dest1Size == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+    BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(1).addImm(0);
 
     MI.eraseFromParent();
     return BB;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index b2dcd77274989..e68353e5223fb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7792,8 +7792,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
 ; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
@@ -7803,143 +7804,175 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT:    s_sub_u32 s4, 0, s10
-; GFX6-NEXT:    s_subb_u32 s5, 0, s11
+; GFX6-NEXT:    s_sub_u32 s12, 0, s10
+; GFX6-NEXT:    s_subb_u32 s13, 0, s11
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
-; GFX6-NEXT:    s_add_u32 s2, s2, s12
-; GFX6-NEXT:    s_mov_b32 s13, s12
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_addc_u32 s3, s3, s12
-; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
-; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
-; GFX6-NEXT:    v_mov_b32_e32 v5, s11
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
-; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s10, v3
-; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v0
-; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
-; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v6, s3
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[8:9]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s1, s12, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
+; GFX6-NEXT:    s_mul_i32 s15, s13, s0
+; GFX6-NEXT:    s_mul_i32 s16, s12, s0
+; GFX6-NEXT:    s_add_i32 s1, s17, s1
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s16
+; GFX6-NEXT:    s_add_i32 s1, s1, s15
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v3
+; GFX6-NEXT:    s_mul_i32 s17, s0, s1
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s1
+; GFX6-NEXT:    s_add_u32 s15, s15, s17
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v0
+; GFX6-NEXT:    s_addc_u32 s17, 0, s17
+; GFX6-NEXT:    s_mul_i32 s16, s14, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s18, v4
+; GFX6-NEXT:    s_add_u32 s15, s15, s16
+; GFX6-NEXT:    s_addc_u32 s15, s17, s18
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
+; GFX6-NEXT:    s_addc_u32 s16, s16, 0
+; GFX6-NEXT:    s_mul_i32 s1, s14, s1
+; GFX6-NEXT:    s_add_u32 s1, s15, s1
+; GFX6-NEXT:    s_addc_u32 s15, 0, s16
+; GFX6-NEXT:    s_add_i32 s16, s0, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NEXT:    s_cselect_b64 s[0:1], 1, 0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX6-NEXT:    s_addc_u32 s14, s14, s15
+; GFX6-NEXT:    s_mul_i32 s0, s12, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX6-NEXT:    s_add_i32 s0, s1, s0
+; GFX6-NEXT:    s_mul_i32 s13, s13, s16
+; GFX6-NEXT:    s_mul_i32 s1, s12, s16
+; GFX6-NEXT:    s_add_i32 s0, s0, s13
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    v_mul_hi_u32 v3, s14, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s14, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s16, v0
+; GFX6-NEXT:    s_mul_i32 s13, s16, s0
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
+; GFX6-NEXT:    s_add_u32 s13, s17, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
+; GFX6-NEXT:    s_mul_i32 s1, s14, s1
+; GFX6-NEXT:    s_addc_u32 s15, 0, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
+; GFX6-NEXT:    s_add_u32 s1, s13, s1
+; GFX6-NEXT:    s_addc_u32 s1, s15, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX6-NEXT:    s_addc_u32 s12, s12, 0
+; GFX6-NEXT:    s_mul_i32 s0, s14, s0
+; GFX6-NEXT:    s_add_u32 s0, s1, s0
+; GFX6-NEXT:    s_addc_u32 s12, 0, s12
+; GFX6-NEXT:    s_add_i32 s15, s16, s0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], 1, 0
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX6-NEXT:    s_addc_u32 s14, s14, s12
+; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
+; GFX6-NEXT:    s_add_u32 s0, s6, s12
+; GFX6-NEXT:    s_mov_b32 s13, s12
+; GFX6-NEXT:    s_addc_u32 s1, s7, s12
+; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
+; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v0
+; GFX6-NEXT:    v_mov_b32_e32 v2, s15
+; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v2
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v1, s7, v2
+; GFX6-NEXT:    s_mul_i32 s1, s6, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v3
+; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GFX6-NEXT:    s_add_u32 s1, s16, s1
+; GFX6-NEXT:    s_addc_u32 s4, 0, s4
+; GFX6-NEXT:    s_mul_i32 s15, s7, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
+; GFX6-NEXT:    s_add_u32 s1, s1, s15
+; GFX6-NEXT:    s_addc_u32 s1, s4, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_addc_u32 s4, s4, 0
+; GFX6-NEXT:    s_mul_i32 s14, s7, s14
+; GFX6-NEXT:    s_add_u32 s14, s1, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_mul_hi_u32 v0, s10, v0
+; GFX6-NEXT:    s_addc_u32 s15, 0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    s_mul_i32 s4, s10, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX6-NEXT:    s_add_i32 s4, s5, s4
+; GFX6-NEXT:    s_mul_i32 s5, s11, s14
+; GFX6-NEXT:    s_add_i32 s16, s4, s5
+; GFX6-NEXT:    s_sub_i32 s17, s7, s16
+; GFX6-NEXT:    s_mul_i32 s4, s10, s14
+; GFX6-NEXT:    s_sub_i32 s6, s6, s4
+; GFX6-NEXT:    s_cselect_b64 s[4:5], 1, 0
+; GFX6-NEXT:    s_or_b32 s18, s4, s5
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_subb_u32 s17, s17, s11
+; GFX6-NEXT:    s_sub_i32 s19, s6, s10
+; GFX6-NEXT:    s_cselect_b64 s[4:5], 1, 0
+; GFX6-NEXT:    s_or_b32 s4, s4, s5
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX6-NEXT:    s_subb_u32 s4, s17, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s11
+; GFX6-NEXT:    s_cselect_b32 s5, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s19, s10
+; GFX6-NEXT:    s_cselect_b32 s17, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s4, s11
+; GFX6-NEXT:    s_cselect_b32 s4, s17, s5
+; GFX6-NEXT:    s_add_u32 s5, s14, 1
+; GFX6-NEXT:    s_addc_u32 s17, s15, 0
+; GFX6-NEXT:    s_add_u32 s19, s14, 2
+; GFX6-NEXT:    s_addc_u32 s20, s15, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX6-NEXT:    s_cselect_b32 s4, s19, s5
+; GFX6-NEXT:    s_cselect_b32 s5, s20, s17
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_subb_u32 s7, s7, s16
+; GFX6-NEXT:    s_cmp_ge_u32 s7, s11
+; GFX6-NEXT:    s_cselect_b32 s16, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s6, s10
+; GFX6-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s7, s11
+; GFX6-NEXT:    s_cselect_b32 s6, s6, s16
+; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX6-NEXT:    s_cselect_b32 s5, s5, s15
+; GFX6-NEXT:    s_cselect_b32 s4, s4, s14
+; GFX6-NEXT:    s_xor_b64 s[6:7], s[12:13], s[8:9]
+; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT:    s_sub_u32 s4, s4, s6
+; GFX6-NEXT:    s_subb_u32 s5, s5, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
-; GFX9-NEXT:    s_ashr_i32 s2, s1, 31
-; GFX9-NEXT:    s_add_u32 s0, s0, s2
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_addc_u32 s1, s1, s2
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT:    s_sub_u32 s0, 0, s6
-; GFX9-NEXT:    s_subb_u32 s1, 0, s7
+; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
+; GFX9-NEXT:    s_add_u32 s0, s0, s6
+; GFX9-NEXT:    s_mov_b32 s7, s6
+; GFX9-NEXT:    s_addc_u32 s1, s1, s6
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_sub_u32 s10, 0, s8
+; GFX9-NEXT:    s_subb_u32 s11, 0, s9
 ; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -7949,130 +7982,122 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    v_madmk_f32 v1, v2, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX9-NEXT:    s_mul_i32 s12, s0, s4
-; GFX9-NEXT:    s_mul_hi_u32 s14, s0, s5
-; GFX9-NEXT:    s_mul_i32 s13, s1, s5
-; GFX9-NEXT:    s_add_i32 s12, s14, s12
-; GFX9-NEXT:    s_mul_i32 s15, s0, s5
-; GFX9-NEXT:    s_add_i32 s12, s12, s13
-; GFX9-NEXT:    s_mul_hi_u32 s14, s5, s15
-; GFX9-NEXT:    s_mul_hi_u32 s13, s5, s12
-; GFX9-NEXT:    s_mul_i32 s5, s5, s12
-; GFX9-NEXT:    s_add_u32 s5, s14, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s12, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    s_mul_i32 s5, s10, s12
+; GFX9-NEXT:    s_mul_hi_u32 s14, s10, s4
+; GFX9-NEXT:    s_mul_i32 s13, s11, s4
+; GFX9-NEXT:    s_add_i32 s5, s14, s5
+; GFX9-NEXT:    s_mul_i32 s15, s10, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s13
+; GFX9-NEXT:    s_mul_hi_u32 s14, s4, s15
+; GFX9-NEXT:    s_mul_i32 s16, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s13, s4, s5
+; GFX9-NEXT:    s_add_u32 s14, s14, s16
 ; GFX9-NEXT:    s_addc_u32 s13, 0, s13
-; GFX9-NEXT:    s_mul_hi_u32 s16, s4, s15
-; GFX9-NEXT:    s_mul_i32 s15, s4, s15
-; GFX9-NEXT:    s_add_u32 s5, s5, s15
-; GFX9-NEXT:    s_mul_hi_u32 s14, s4, s12
-; GFX9-NEXT:    s_addc_u32 s5, s13, s16
-; GFX9-NEXT:    s_addc_u32 s13, s14, 0
-; GFX9-NEXT:    s_mul_i32 s12, s4, s12
-; GFX9-NEXT:    s_add_u32 s5, s5, s12
-; GFX9-NEXT:    s_addc_u32 s12, 0, s13
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s5, v1
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s4, s4, s12
-; GFX9-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX9-NEXT:    s_mul_i32 s5, s0, s4
-; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s12
-; GFX9-NEXT:    s_add_i32 s5, s13, s5
-; GFX9-NEXT:    s_mul_i32 s1, s1, s12
-; GFX9-NEXT:    s_add_i32 s5, s5, s1
-; GFX9-NEXT:    s_mul_i32 s0, s0, s12
-; GFX9-NEXT:    s_mul_hi_u32 s13, s4, s0
-; GFX9-NEXT:    s_mul_i32 s14, s4, s0
-; GFX9-NEXT:    s_mul_i32 s16, s12, s5
-; GFX9-NEXT:    s_mul_hi_u32 s0, s12, s0
-; GFX9-NEXT:    s_mul_hi_u32 s15, s12, s5
-; GFX9-NEXT:    s_add_u32 s0, s0, s16
-; GFX9-NEXT:    s_addc_u32 s12, 0, s15
-; GFX9-NEXT:    s_add_u32 s0, s0, s14
-; GFX9-NEXT:    s_mul_hi_u32 s1, s4, s5
-; GFX9-NEXT:    s_addc_u32 s0, s12, s13
-; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    s_mul_i32 s5, s4, s5
-; GFX9-NEXT:    s_add_u32 s0, s0, s5
-; GFX9-NEXT:    s_addc_u32 s1, 0, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s12, s4, s1
-; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
-; GFX9-NEXT:    s_add_u32 s0, s10, s4
+; GFX9-NEXT:    s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT:    s_mul_i32 s15, s12, s15
+; GFX9-NEXT:    s_add_u32 s14, s14, s15
+; GFX9-NEXT:    s_mul_hi_u32 s16, s12, s5
+; GFX9-NEXT:    s_addc_u32 s13, s13, s17
+; GFX9-NEXT:    s_addc_u32 s14, s16, 0
+; GFX9-NEXT:    s_mul_i32 s5, s12, s5
+; GFX9-NEXT:    s_add_u32 s5, s13, s5
+; GFX9-NEXT:    s_addc_u32 s13, 0, s14
+; GFX9-NEXT:    s_add_i32 s14, s4, s5
+; GFX9-NEXT:    s_cselect_b64 s[4:5], 1, 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT:    s_addc_u32 s12, s12, s13
+; GFX9-NEXT:    s_mul_i32 s4, s10, s12
+; GFX9-NEXT:    s_mul_hi_u32 s5, s10, s14
+; GFX9-NEXT:    s_add_i32 s4, s5, s4
+; GFX9-NEXT:    s_mul_i32 s11, s11, s14
+; GFX9-NEXT:    s_add_i32 s4, s4, s11
+; GFX9-NEXT:    s_mul_i32 s10, s10, s14
+; GFX9-NEXT:    s_mul_hi_u32 s11, s12, s10
+; GFX9-NEXT:    s_mul_i32 s13, s12, s10
+; GFX9-NEXT:    s_mul_i32 s16, s14, s4
+; GFX9-NEXT:    s_mul_hi_u32 s10, s14, s10
+; GFX9-NEXT:    s_mul_hi_u32 s15, s14, s4
+; GFX9-NEXT:    s_add_u32 s10, s10, s16
+; GFX9-NEXT:    s_addc_u32 s15, 0, s15
+; GFX9-NEXT:    s_add_u32 s10, s10, s13
+; GFX9-NEXT:    s_mul_hi_u32 s5, s12, s4
+; GFX9-NEXT:    s_addc_u32 s10, s15, s11
+; GFX9-NEXT:    s_addc_u32 s5, s5, 0
+; GFX9-NEXT:    s_mul_i32 s4, s12, s4
+; GFX9-NEXT:    s_add_u32 s4, s10, s4
+; GFX9-NEXT:    s_addc_u32 s10, 0, s5
+; GFX9-NEXT:    s_add_i32 s14, s14, s4
+; GFX9-NEXT:    s_cselect_b64 s[4:5], 1, 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT:    s_addc_u32 s10, s12, s10
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/159814


More information about the llvm-commits mailing list