[llvm] e438f2d - [DAGCombine] Do not fold SRA/SRL of MUL into MULH when MUL's LSBs are

Juan Manuel MARTINEZ CAAMAÑO via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 16 08:49:00 PDT 2022


Author: Juan Manuel MARTINEZ CAAMAÑO
Date: 2022-09-16T15:48:36Z
New Revision: e438f2d821119439c2b05b2fef7617064a6233bc

URL: https://github.com/llvm/llvm-project/commit/e438f2d821119439c2b05b2fef7617064a6233bc
DIFF: https://github.com/llvm/llvm-project/commit/e438f2d821119439c2b05b2fef7617064a6233bc.diff

LOG: [DAGCombine] Do not fold SRA/SRL of MUL into MULH when MUL's LSBs are
used and MUL_LOHI is available

Folding a sra(mul) / srl(mul) into a mulh introduces an extra
multiplication to compute the high half of the product, while it is
more profitable to compute both the high and low halves with a single
mul_lohi.

Differential Revision: https://reviews.llvm.org/D133768
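
A minimal source-level sketch (not taken from the commit) of the kind of
computation the new tests reduce to: both halves of a widening 32x32->64
multiply are used, so folding the high-half extraction into a MULH would add
a second multiply, while a single MUL_LOHI-style operation (e.g. AMDGPU's
v_mad_u64_u32 with a zero addend) yields both halves at once.

  #include <cstdint>

  // Hypothetical analogue of the tests below: both the low and the high
  // half of the widening multiply are needed.
  uint32_t mul_both_halves(uint32_t a, uint32_t b, uint32_t *hi) {
    uint64_t wide = (uint64_t)a * (uint64_t)b; // widening 32x32->64 multiply
    *hi = (uint32_t)(wide >> 32);              // high half
    return (uint32_t)wide;                     // low half is also used
  }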

Added: 
    llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4354a9191fefb..7a2e89eb01799 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9244,6 +9244,28 @@ static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
   EVT NarrowVT = LeftOp.getOperand(0).getValueType();
   unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
 
+  // Return true if U may use the lower bits of its operands.
+  auto UserOfLowerBits = [NarrowVTSize](SDNode *U) {
+    if (U->getOpcode() != ISD::SRL && U->getOpcode() != ISD::SRA) {
+      return true;
+    }
+    ConstantSDNode *UShiftAmtSrc = isConstOrConstSplat(U->getOperand(1));
+    if (!UShiftAmtSrc) {
+      return true;
+    }
+    unsigned UShiftAmt = UShiftAmtSrc->getZExtValue();
+    return UShiftAmt < NarrowVTSize;
+  };
+
+  // If the lower half of the MUL is also used and MUL_LOHI is supported,
+  // do not introduce the MULH in favor of a single MUL_LOHI.
+  unsigned MulLoHiOp = IsSignExt ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
+  if (!ShiftOperand.hasOneUse() &&
+      TLI.isOperationLegalOrCustom(MulLoHiOp, NarrowVT) &&
+      llvm::any_of(ShiftOperand->uses(), UserOfLowerBits)) {
+    return SDValue();
+  }
+
   SDValue MulhRightOp;
   if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
     unsigned ActiveBits = IsSignExt

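For concreteness, here is a standalone sketch (hypothetical helper, not part
of the patch) of the per-user test that UserOfLowerBits applies when NarrowVT
is i32: only right-shift users with a constant amount of at least 32 are
known not to read the low half of the wide product.

  #include <cassert>
  #include <cstdint>
  #include <optional>

  // Hypothetical mirror of UserOfLowerBits, specialized to a 32-bit narrow
  // type: returns true when a user of the 64-bit multiply may read its low
  // 32 bits.
  static bool mayUseLowerBits(bool IsShiftRight,
                              std::optional<uint64_t> ConstAmt,
                              unsigned NarrowVTSize = 32) {
    if (!IsShiftRight)
      return true;                   // e.g. the trunc/store of the low half
    if (!ConstAmt)
      return true;                   // variable shift amount: be conservative
    return *ConstAmt < NarrowVTSize; // shift by >= 32 reads only the high half
  }

  int main() {
    assert(mayUseLowerBits(false, std::nullopt)); // plain user of the value
    assert(mayUseLowerBits(true, 31));  // lshr by 31 still needs low bits
    assert(!mayUseLowerBits(true, 32)); // lshr by 32 reads only the high half
  }
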
diff  --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
new file mode 100644
index 0000000000000..ad04afe1698b6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+
+define i32 @mullohi_u32(i32 %arg, i32 %arg1, i32* %arg2) {
+; CHECK-LABEL: mullohi_u32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, 0
+; CHECK-NEXT:    flat_store_dword v[2:3], v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %i = zext i32 %arg to i64
+  %i3 = zext i32 %arg1 to i64
+  %i4 = mul nuw i64 %i3, %i
+  %i5 = lshr i64 %i4, 32
+  %i6 = trunc i64 %i5 to i32
+  store i32 %i6, i32* %arg2, align 4
+  %i7 = trunc i64 %i4 to i32
+  ret i32 %i7
+}
+
+define i32 @mullohi_s32(i32 %arg, i32 %arg1, i32* %arg2) {
+; CHECK-LABEL: mullohi_s32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v1, v0, 0
+; CHECK-NEXT:    flat_store_dword v[2:3], v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %i = sext i32 %arg to i64
+  %i3 = sext i32 %arg1 to i64
+  %i4 = mul nsw i64 %i3, %i
+  %i5 = ashr i64 %i4, 32
+  %i6 = trunc i64 %i5 to i32
+  store i32 %i6, i32* %arg2, align 4
+  %i7 = trunc i64 %i4 to i32
+  ret i32 %i7
+}
+
+define i32 @mullohi_u32_non_const_shift(i32 %arg, i32 %arg1, i32* %arg2, i64 %shift) {
+; CHECK-LABEL: mullohi_u32_non_const_shift:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v1, v0, 0
+; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v4, v[5:6]
+; CHECK-NEXT:    flat_store_dword v[2:3], v6
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %i = zext i32 %arg to i64
+  %i3 = zext i32 %arg1 to i64
+  %i4 = mul nuw i64 %i3, %i
+  %i5 = lshr i64 %i4, 32
+  %i6 = trunc i64 %i5 to i32
+  store i32 %i6, i32* %arg2, align 4
+  %i7 = lshr i64 %i4, %shift
+  %i8 = trunc i64 %i7 to i32
+  ret i32 %i8
+}
+
+define <2 x i32> @mullohi_2xu32(<2 x i32> %arg, <2 x i32> %arg1, <2 x i32>* %arg2) {
+; CHECK-LABEL: mullohi_2xu32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v6, v1
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
+; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v3, v6, 0
+; CHECK-NEXT:    v_mov_b32_e32 v6, v1
+; CHECK-NEXT:    v_mov_b32_e32 v7, v3
+; CHECK-NEXT:    v_mov_b32_e32 v1, v2
+; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %i = zext <2 x i32> %arg to <2 x i64>
+  %i3 = zext <2 x i32> %arg1 to <2 x i64>
+  %i4 = mul nuw <2 x i64> %i3, %i
+  %i5 = lshr <2 x i64> %i4, <i64 32, i64 32>
+  %i6 = trunc <2 x i64> %i5 to <2 x i32>
+  store <2 x i32> %i6, <2 x i32>* %arg2, align 8
+  %i7 = trunc <2 x i64> %i4 to <2 x i32>
+  ret <2 x i32> %i7
+}
+
+define i8 @mullohi_illegal_ty(i8 %arg, i8 %arg1, i8* %arg2) {
+; CHECK-LABEL: mullohi_illegal_ty:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; CHECK-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
+; CHECK-NEXT:    flat_store_byte v[2:3], v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %i = zext i8 %arg to i16
+  %i3 = zext i8 %arg1 to i16
+  %i4 = mul nuw i16 %i3, %i
+  %i5 = lshr i16 %i4, 8
+  %i6 = trunc i16 %i5 to i8
+  store i8 %i6, i8* %arg2, align 1
+  %i7 = trunc i16 %i4 to i8
+  ret i8 %i7
+}
+
+define i32 @mul_one_bit_low_hi_u32(i32 %arg, i32 %arg1, i32* %arg2) {
+; CHECK-LABEL: mul_one_bit_low_hi_u32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, 0
+; CHECK-NEXT:    v_alignbit_b32 v0, v1, v0, 31
+; CHECK-NEXT:    flat_store_dword v[2:3], v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %i = zext i32 %arg to i64
+  %i3 = zext i32 %arg1 to i64
+  %i4 = mul nsw i64 %i3, %i
+  %i5 = lshr i64 %i4, 32
+  %i6 = trunc i64 %i5 to i32
+  store i32 %i6, i32* %arg2, align 4
+  %i7 = lshr i64 %i4, 31
+  %i8 = trunc i64 %i7 to i32
+  ret i32 %i8
+}
+
+define i32 @mul_one_bit_hi_hi_u32(i32 %arg, i32 %arg1, i32* %arg2) {
+; CHECK-LABEL: mul_one_bit_hi_hi_u32:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mul_hi_u32 v0, v1, v0
+; CHECK-NEXT:    flat_store_dword v[2:3], v0
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %i = zext i32 %arg to i64
+  %i3 = zext i32 %arg1 to i64
+  %i4 = mul nsw i64 %i3, %i
+  %i5 = lshr i64 %i4, 32
+  %i6 = trunc i64 %i5 to i32
+  store i32 %i6, i32* %arg2, align 4
+  %i7 = lshr i64 %i4, 33
+  %i8 = trunc i64 %i7 to i32
+  ret i32 %i8
+}

More information about the llvm-commits mailing list