[llvm] [AMDGPU] Fix sign confusion in performMulLoHiCombine (PR #105831)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 23 06:42:21 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
SMUL_LOHI and UMUL_LOHI are different operations because the high part
of the result is different, so it is not OK to optimize the signed
version to MUL_U24/MULHI_U24 or the unsigned version to
MUL_I24/MULHI_I24.
---
Full diff: https://github.com/llvm/llvm-project/pull/105831.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+18-12)
- (modified) llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll (+56-52)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e57c8f8b7b4835..96143d688801aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4346,6 +4346,7 @@ AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
+ bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -4360,20 +4361,25 @@ AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
// Try to use two fast 24-bit multiplies (one for each half of the result)
// instead of one slow extending multiply.
- unsigned LoOpcode, HiOpcode;
- if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
- N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
- N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
- LoOpcode = AMDGPUISD::MUL_U24;
- HiOpcode = AMDGPUISD::MULHI_U24;
- } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
- N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
- N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
- LoOpcode = AMDGPUISD::MUL_I24;
- HiOpcode = AMDGPUISD::MULHI_I24;
+ unsigned LoOpcode = 0;
+ unsigned HiOpcode = 0;
+ if (Signed) {
+ if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
+ N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+ LoOpcode = AMDGPUISD::MUL_I24;
+ HiOpcode = AMDGPUISD::MULHI_I24;
+ }
} else {
- return SDValue();
+ if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
+ N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+ LoOpcode = AMDGPUISD::MUL_U24;
+ HiOpcode = AMDGPUISD::MULHI_U24;
+ }
}
+ if (!LoOpcode)
+ return SDValue();
SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
index 113c6d01c99a16..4143c65a840d71 100644
--- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
@@ -1052,21 +1052,22 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
; GFX9-NEXT: s_mov_b32 s6, 0x80000001
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3]
-; GFX9-NEXT: v_mul_i32_i24_e32 v2, 3, v6
-; GFX9-NEXT: v_mul_hi_i32_i24_e32 v7, 3, v6
-; GFX9-NEXT: v_mov_b32_e32 v8, v5
+; GFX9-NEXT: v_mul_i32_i24_e32 v8, 3, v6
+; GFX9-NEXT: v_lshl_add_u32 v9, v6, 31, v6
+; GFX9-NEXT: v_mov_b32_e32 v10, v5
; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, s6, v[4:5]
-; GFX9-NEXT: v_lshl_add_u32 v6, v6, 31, v6
-; GFX9-NEXT: v_add3_u32 v3, v7, v6, v2
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, -1, v[2:3]
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[4:5], 0, 0, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[4:5]
-; GFX9-NEXT: v_sub_u32_e32 v3, v3, v1
-; GFX9-NEXT: v_sub_u32_e32 v3, v3, v0
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
+; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1
+; GFX9-NEXT: v_sub_u32_e32 v5, v5, v0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, 1, v[2:3]
; GFX9-NEXT: s_brev_b32 s6, -2
; GFX9-NEXT: v_add_u32_e32 v3, v1, v3
@@ -1083,11 +1084,11 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
; GFX942-LABEL: srem64_i32max:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; GFX942-NEXT: v_mul_i32_i24_e32 v2, 3, v3
-; GFX942-NEXT: v_mul_hi_i32_i24_e32 v4, 3, v3
-; GFX942-NEXT: v_lshl_add_u32 v3, v3, 31, v3
-; GFX942-NEXT: v_add3_u32 v3, v4, v3, v2
+; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT: v_mul_i32_i24_e32 v4, 3, v2
+; GFX942-NEXT: v_lshl_add_u32 v5, v2, 31, v2
+; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, 0
+; GFX942-NEXT: v_add3_u32 v3, v3, v5, v4
; GFX942-NEXT: v_mul_hi_u32 v4, v0, 3
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, 3, v[4:5]
@@ -1124,16 +1125,17 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GFX1030-NEXT: v_mul_hi_i32_i24_e32 v8, 3, v6
+; GFX1030-NEXT: v_mul_i32_i24_e32 v7, 3, v6
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3]
-; GFX1030-NEXT: v_mul_i32_i24_e32 v2, 3, v6
-; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6
-; GFX1030-NEXT: v_mov_b32_e32 v7, v5
+; GFX1030-NEXT: v_mov_b32_e32 v8, v5
; GFX1030-NEXT: v_mov_b32_e32 v5, v3
-; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v0, v[4:5]
-; GFX1030-NEXT: v_add3_u32 v3, v8, v6, v2
+; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0
+; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6
+; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5]
+; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v7
+; GFX1030-NEXT: v_mov_b32_e32 v4, v5
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3]
-; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4
+; GFX1030-NEXT: v_add_co_u32 v4, s4, v8, v4
; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4
; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5]
@@ -1165,21 +1167,22 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
; GFX9-NEXT: s_mov_b32 s6, 0x80000001
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3]
-; GFX9-NEXT: v_mul_i32_i24_e32 v2, 3, v6
-; GFX9-NEXT: v_mul_hi_i32_i24_e32 v7, 3, v6
-; GFX9-NEXT: v_mov_b32_e32 v8, v5
+; GFX9-NEXT: v_mul_i32_i24_e32 v8, 3, v6
+; GFX9-NEXT: v_lshl_add_u32 v9, v6, 31, v6
+; GFX9-NEXT: v_mov_b32_e32 v10, v5
; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, s6, v[4:5]
-; GFX9-NEXT: v_lshl_add_u32 v6, v6, 31, v6
-; GFX9-NEXT: v_add3_u32 v3, v7, v6, v2
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, -1, v[2:3]
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[4:5], 0, 0, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[4:5]
-; GFX9-NEXT: v_sub_u32_e32 v3, v3, v1
-; GFX9-NEXT: v_sub_u32_e32 v3, v3, v0
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
+; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1
+; GFX9-NEXT: v_sub_u32_e32 v5, v5, v0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, 1, v[2:3]
; GFX9-NEXT: v_add_u32_e32 v3, v1, v3
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 30, v[2:3]
@@ -1191,11 +1194,11 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
; GFX942-LABEL: sdiv64_i32max:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; GFX942-NEXT: v_mul_i32_i24_e32 v2, 3, v3
-; GFX942-NEXT: v_mul_hi_i32_i24_e32 v4, 3, v3
-; GFX942-NEXT: v_lshl_add_u32 v3, v3, 31, v3
-; GFX942-NEXT: v_add3_u32 v3, v4, v3, v2
+; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT: v_mul_i32_i24_e32 v4, 3, v2
+; GFX942-NEXT: v_lshl_add_u32 v5, v2, 31, v2
+; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, 0
+; GFX942-NEXT: v_add3_u32 v3, v3, v5, v4
; GFX942-NEXT: v_mul_hi_u32 v4, v0, 3
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, 3, v[4:5]
@@ -1224,16 +1227,17 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GFX1030-NEXT: v_mul_hi_i32_i24_e32 v8, 3, v6
+; GFX1030-NEXT: v_mul_i32_i24_e32 v7, 3, v6
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3]
-; GFX1030-NEXT: v_mul_i32_i24_e32 v2, 3, v6
-; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6
-; GFX1030-NEXT: v_mov_b32_e32 v7, v5
+; GFX1030-NEXT: v_mov_b32_e32 v8, v5
; GFX1030-NEXT: v_mov_b32_e32 v5, v3
-; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v0, v[4:5]
-; GFX1030-NEXT: v_add3_u32 v3, v8, v6, v2
+; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0
+; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6
+; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5]
+; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v7
+; GFX1030-NEXT: v_mov_b32_e32 v4, v5
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3]
-; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4
+; GFX1030-NEXT: v_add_co_u32 v4, s4, v8, v4
; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4
; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5]
``````````
</details>
https://github.com/llvm/llvm-project/pull/105831
More information about the llvm-commits mailing list