[llvm] [AMDGPU] Calc IsVALU correctly during UADDO/USUBO selection (PR #159814)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 19 10:07:43 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (LU-JOHN)
<details>
<summary>Changes</summary>
Fix two bugs. The first bug hid the second bug.
1. Calculate IsVALU correctly during UADDO/USUBO selection. IsVALU should be false if the carryout users are UADDO_CARRY/USUBO_CARRY. However, instruction selection visits uses before defs, so the UADDO_CARRY/USUBO_CARRY nodes are normally (probably always) already converted to S_ADD_CO_PSEUDO/S_SUB_CO_PSEUDO. The fix is to also check for these machine opcodes.
2. Without the first fix, UADDO/USUBO selection will always select the VALU instructions V_ADD_CO_U32_e64/V_SUB_CO_U32_e64. S_UADDO_PSEUDO/S_USUBO_PSEUDO were never selected in the CodeGen/AMDGPU tests. Thus, the S_UADDO_PSEUDO/S_USUBO_PSEUDO cases were never hit in EmitInstrWithCustomInserter. The code generation for S_UADDO_PSEUDO/S_USUBO_PSEUDO had a bug: it could not handle a 32-bit $scc_out.
---
Patch is 501.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159814.diff
11 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+11-4)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+8-3)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+1674-1496)
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+898-917)
- (modified) llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll (+32-29)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+277-215)
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+1724-1605)
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+428-317)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+121-88)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+273-196)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+255-259)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c2fca79979e1b..f544c215c8661 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1089,10 +1089,17 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
++UI)
if (UI.getUse().getResNo() == 1) {
- if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
- (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
- IsVALU = true;
- break;
+ if (UI->isMachineOpcode()) {
+ if (UI->getMachineOpcode() !=
+ (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
+ IsVALU = true;
+ break;
+ }
+ } else {
+ if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
+ IsVALU = true;
+ break;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 363717b017ef0..e50bc6c4d363a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5946,6 +5946,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const DebugLoc &DL = MI.getDebugLoc();
MachineOperand &Dest0 = MI.getOperand(0);
MachineOperand &Dest1 = MI.getOperand(1);
@@ -5961,9 +5963,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.add(Src1);
// clang-format on
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
- .addImm(1)
- .addImm(0);
+ const TargetRegisterClass *Dest1RC = MRI.getRegClass(Dest1.getReg());
+ unsigned Dest1Size = TRI->getRegSizeInBits(*Dest1RC);
+ assert(Dest1Size == 64 || Dest1Size == 32);
+ unsigned SelOpc =
+ (Dest1Size == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+ BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(1).addImm(0);
MI.eraseFromParent();
return BB;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index b2dcd77274989..e68353e5223fb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7792,8 +7792,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
; GFX6-NEXT: s_ashr_i32 s8, s1, 31
@@ -7803,143 +7804,175 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT: s_sub_u32 s4, 0, s10
-; GFX6-NEXT: s_subb_u32 s5, 0, s11
+; GFX6-NEXT: s_sub_u32 s12, 0, s10
+; GFX6-NEXT: s_subb_u32 s13, 0, s11
; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_ashr_i32 s12, s3, 31
-; GFX6-NEXT: s_add_u32 s2, s2, s12
-; GFX6-NEXT: s_mov_b32 s13, s12
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
-; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: s_addc_u32 s3, s3, s12
-; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13]
-; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1
-; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
-; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0
-; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
-; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
-; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2
-; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4
-; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1
-; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
-; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
-; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
-; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
-; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3
-; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3
-; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1
-; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0
-; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1
-; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1
-; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1
-; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0
-; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0
-; GFX6-NEXT: v_mov_b32_e32 v5, s11
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
-; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3
-; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4
-; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1]
-; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0
-; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0
-; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v6, s3
-; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3
-; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
-; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1
+; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_mul_i32 s1, s12, s14
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_mul_i32 s15, s13, s0
+; GFX6-NEXT: s_mul_i32 s16, s12, s0
+; GFX6-NEXT: s_add_i32 s1, s17, s1
+; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16
+; GFX6-NEXT: s_add_i32 s1, s1, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16
+; GFX6-NEXT: v_readfirstlane_b32 s15, v3
+; GFX6-NEXT: s_mul_i32 s17, s0, s1
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1
+; GFX6-NEXT: s_add_u32 s15, s15, s17
+; GFX6-NEXT: v_readfirstlane_b32 s17, v0
+; GFX6-NEXT: s_addc_u32 s17, 0, s17
+; GFX6-NEXT: s_mul_i32 s16, s14, s16
+; GFX6-NEXT: v_readfirstlane_b32 s18, v4
+; GFX6-NEXT: s_add_u32 s15, s15, s16
+; GFX6-NEXT: s_addc_u32 s15, s17, s18
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: s_addc_u32 s16, s16, 0
+; GFX6-NEXT: s_mul_i32 s1, s14, s1
+; GFX6-NEXT: s_add_u32 s1, s15, s1
+; GFX6-NEXT: s_addc_u32 s15, 0, s16
+; GFX6-NEXT: s_add_i32 s16, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_cselect_b64 s[0:1], 1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
+; GFX6-NEXT: s_addc_u32 s14, s14, s15
+; GFX6-NEXT: s_mul_i32 s0, s12, s14
+; GFX6-NEXT: v_readfirstlane_b32 s1, v0
+; GFX6-NEXT: s_add_i32 s0, s1, s0
+; GFX6-NEXT: s_mul_i32 s13, s13, s16
+; GFX6-NEXT: s_mul_i32 s1, s12, s16
+; GFX6-NEXT: s_add_i32 s0, s0, s13
; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0
+; GFX6-NEXT: s_mul_i32 s13, s16, s0
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_add_u32 s13, s17, s13
+; GFX6-NEXT: v_readfirstlane_b32 s15, v0
+; GFX6-NEXT: s_mul_i32 s1, s14, s1
+; GFX6-NEXT: s_addc_u32 s15, 0, s15
+; GFX6-NEXT: v_readfirstlane_b32 s12, v3
+; GFX6-NEXT: s_add_u32 s1, s13, s1
+; GFX6-NEXT: s_addc_u32 s1, s15, s12
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: s_addc_u32 s12, s12, 0
+; GFX6-NEXT: s_mul_i32 s0, s14, s0
+; GFX6-NEXT: s_add_u32 s0, s1, s0
+; GFX6-NEXT: s_addc_u32 s12, 0, s12
+; GFX6-NEXT: s_add_i32 s15, s16, s0
+; GFX6-NEXT: s_cselect_b64 s[0:1], 1, 0
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
+; GFX6-NEXT: s_addc_u32 s14, s14, s12
+; GFX6-NEXT: s_ashr_i32 s12, s7, 31
+; GFX6-NEXT: s_add_u32 s0, s6, s12
+; GFX6-NEXT: s_mov_b32 s13, s12
+; GFX6-NEXT: s_addc_u32 s1, s7, s12
+; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13]
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, s15
+; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2
+; GFX6-NEXT: s_mov_b32 s0, s4
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
+; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2
+; GFX6-NEXT: s_mul_i32 s1, s6, s14
+; GFX6-NEXT: v_readfirstlane_b32 s16, v3
+; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0
+; GFX6-NEXT: s_add_u32 s1, s16, s1
+; GFX6-NEXT: s_addc_u32 s4, 0, s4
+; GFX6-NEXT: s_mul_i32 s15, s7, s15
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: s_add_u32 s1, s1, s15
+; GFX6-NEXT: s_addc_u32 s1, s4, s16
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_addc_u32 s4, s4, 0
+; GFX6-NEXT: s_mul_i32 s14, s7, s14
+; GFX6-NEXT: s_add_u32 s14, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
+; GFX6-NEXT: s_addc_u32 s15, 0, s4
+; GFX6-NEXT: s_mov_b32 s1, s5
+; GFX6-NEXT: s_mul_i32 s4, s10, s15
+; GFX6-NEXT: v_readfirstlane_b32 s5, v0
+; GFX6-NEXT: s_add_i32 s4, s5, s4
+; GFX6-NEXT: s_mul_i32 s5, s11, s14
+; GFX6-NEXT: s_add_i32 s16, s4, s5
+; GFX6-NEXT: s_sub_i32 s17, s7, s16
+; GFX6-NEXT: s_mul_i32 s4, s10, s14
+; GFX6-NEXT: s_sub_i32 s6, s6, s4
+; GFX6-NEXT: s_cselect_b64 s[4:5], 1, 0
+; GFX6-NEXT: s_or_b32 s18, s4, s5
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_subb_u32 s17, s17, s11
+; GFX6-NEXT: s_sub_i32 s19, s6, s10
+; GFX6-NEXT: s_cselect_b64 s[4:5], 1, 0
+; GFX6-NEXT: s_or_b32 s4, s4, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_subb_u32 s4, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s4, s11
+; GFX6-NEXT: s_cselect_b32 s5, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s10
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, s11
+; GFX6-NEXT: s_cselect_b32 s4, s17, s5
+; GFX6-NEXT: s_add_u32 s5, s14, 1
+; GFX6-NEXT: s_addc_u32 s17, s15, 0
+; GFX6-NEXT: s_add_u32 s19, s14, 2
+; GFX6-NEXT: s_addc_u32 s20, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s4, s19, s5
+; GFX6-NEXT: s_cselect_b32 s5, s20, s17
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_subb_u32 s7, s7, s16
+; GFX6-NEXT: s_cmp_ge_u32 s7, s11
+; GFX6-NEXT: s_cselect_b32 s16, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s6, s10
+; GFX6-NEXT: s_cselect_b32 s6, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s7, s11
+; GFX6-NEXT: s_cselect_b32 s6, s6, s16
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
+; GFX6-NEXT: s_cselect_b32 s5, s5, s15
+; GFX6-NEXT: s_cselect_b32 s4, s4, s14
+; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
+; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: s_sub_u32 s4, s4, s6
+; GFX6-NEXT: s_subb_u32 s5, s5, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
-; GFX9-NEXT: s_ashr_i32 s2, s1, 31
-; GFX9-NEXT: s_add_u32 s0, s0, s2
-; GFX9-NEXT: s_mov_b32 s3, s2
-; GFX9-NEXT: s_addc_u32 s1, s1, s2
-; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s0, 0, s6
-; GFX9-NEXT: s_subb_u32 s1, 0, s7
+; GFX9-NEXT: s_ashr_i32 s6, s1, 31
+; GFX9-NEXT: s_add_u32 s0, s0, s6
+; GFX9-NEXT: s_mov_b32 s7, s6
+; GFX9-NEXT: s_addc_u32 s1, s1, s6
+; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_sub_u32 s10, 0, s8
+; GFX9-NEXT: s_subb_u32 s11, 0, s9
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -7949,130 +7982,122 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: s_mul_i32 s12, s0, s4
-; GFX9-NEXT: s_mul_hi_u32 s14, s0, s5
-; GFX9-NEXT: s_mul_i32 s13, s1, s5
-; GFX9-NEXT: s_add_i32 s12, s14, s12
-; GFX9-NEXT: s_mul_i32 s15, s0, s5
-; GFX9-NEXT: s_add_i32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s5, s15
-; GFX9-NEXT: s_mul_hi_u32 s13, s5, s12
-; GFX9-NEXT: s_mul_i32 s5, s5, s12
-; GFX9-NEXT: s_add_u32 s5, s14, s5
+; GFX9-NEXT: v_readfirstlane_b32 s12, v2
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mul_i32 s5, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4
+; GFX9-NEXT: s_mul_i32 s13, s11, s4
+; GFX9-NEXT: s_add_i32 s5, s14, s5
+; GFX9-NEXT: s_mul_i32 s15, s10, s4
+; GFX9-NEXT: s_add_i32 s5, s5, s13
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15
+; GFX9-NEXT: s_mul_i32 s16, s4, s5
+; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX9-NEXT: s_add_u32 s14, s14, s16
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
-; GFX9-NEXT: s_mul_i32 s15, s4, s15
-; GFX9-NEXT: s_add_u32 s5, s5, s15
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s12
-; GFX9-NEXT: s_addc_u32 s5, s13, s16
-; GFX9-NEXT: s_addc_u32 s13, s14, 0
-; GFX9-NEXT: s_mul_i32 s12, s4, s12
-; GFX9-NEXT: s_add_u32 s5, s5, s12
-; GFX9-NEXT: s_addc_u32 s12, 0, s13
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s5, v1
-; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s4, s4, s12
-; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: s_mul_i32 s5, s0, s4
-; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12
-; GFX9-NEXT: s_add_i32 s5, s13, s5
-; GFX9-NEXT: s_mul_i32 s1, s1, s12
-; GFX9-NEXT: s_add_i32 s5, s5, s1
-; GFX9-NEXT: s_mul_i32 s0, s0, s12
-; GFX9-NEXT: s_mul_hi_u32 s13, s4, s0
-; GFX9-NEXT: s_mul_i32 s14, s4, s0
-; GFX9-NEXT: s_mul_i32 s16, s12, s5
-; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0
-; GFX9-NEXT: s_mul_hi_u32 s15, s12, s5
-; GFX9-NEXT: s_add_u32 s0, s0, s16
-; GFX9-NEXT: s_addc_u32 s12, 0, s15
-; GFX9-NEXT: s_add_u32 s0, s0, s14
-; GFX9-NEXT: s_mul_hi_u32 s1, s4, s5
-; GFX9-NEXT: s_addc_u32 s0, s12, s13
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_mul_i32 s5, s4, s5
-; GFX9-NEXT: s_add_u32 s0, s0, s5
-; GFX9-NEXT: s_addc_u32 s1, 0, s1
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1
-; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s12, s4, s1
-; GFX9-NEXT: s_ashr_i32 s4, s11, 31
-; GFX9-NEXT: s_add_u32 s0, s10, s4
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_add_u32 s14, s14, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5
+; GFX9-NEXT: s_addc_u32 s13, s13, s17
+; GFX9-NEXT: s_addc_u32 s14, s16, 0
+; GFX9-NEXT: s_mul_i32 s5, s12, s5
+; GFX9-NEXT: s_add_u32 s5, s13, s5
+; GFX9-NEXT: s_addc_u32 s13, 0, s14
+; GFX9-NEXT: s_add_i32 s14, s4, s5
+; GFX9-NEXT: s_cselect_b64 s[4:5], 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s12, s12, s13
+; GFX9-NEXT: s_mul_i32 s4, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14
+; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s11, s11, s14
+; GFX9-NEXT: s_add_i32 s4, s4, s11
+; GFX9-NEXT: s_mul_i32 s10, s10, s14
+; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s12, s10
+; GFX9-NEXT: s_mul_i32 s16, s14, s4
+; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
+; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
+; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_addc_u32 s15, 0, s15
+; GFX9-NEXT: s_add_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
+; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_addc_u32 s5, s5, 0
+; GFX9-NEXT: s_mul_i32 s4, s12, s4
+; GFX9-NEXT: s_add_u32 s4, s10, s4
+; GFX9-NEXT: s_addc_u32 s10, 0, s5
+; GFX9-NEXT: s_add_i32 s14, s14, s4
+; GFX9-NEXT: s_cselect_b64 s[4:5], 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s10, s12, s10
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_ashr_i32 s4, s3, 31
+; GFX9-NEXT...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/159814
More information about the llvm-commits
mailing list