[llvm] 2da6ef3 - [AMDGPU] Add 24-bit mulhi intrinsics in INTRINSIC_WO_CHAIN combine.

Thu Oct 28 04:28:46 PDT 2021

Author: Abinav Puthan Purayil
Date: 2021-10-28T16:57:48+05:30
New Revision: 2da6ef3664333cc36c449b271c50a72dd7f61940

URL: https://github.com/llvm/llvm-project/commit/2da6ef3664333cc36c449b271c50a72dd7f61940
DIFF: https://github.com/llvm/llvm-project/commit/2da6ef3664333cc36c449b271c50a72dd7f61940.diff

LOG: [AMDGPU] Add 24-bit mulhi intrinsics in INTRINSIC_WO_CHAIN combine.

The operands of the mul24 intrinsics are simplified by
AMDGPUTargetLowering::performIntrinsicWOChainCombine(). This change adds
the 24-bit mulhi intrinsics (amdgcn_mulhi_i24 and amdgcn_mulhi_u24) to
that combine, since their operands can be simplified in the same way as
those of the mul24 intrinsics.

Differential Revision: https://reviews.llvm.org/D112702

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 49abb1e00890..1632362109fd 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2897,8 +2897,22 @@ static SDValue simplifyMul24(SDNode *Node24,
   unsigned NewOpcode = Node24->getOpcode();
   if (IsIntrin) {
     unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
-    NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
-      AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+    switch (IID) {
+    case Intrinsic::amdgcn_mul_i24:
+      NewOpcode = AMDGPUISD::MUL_I24;
+      break;
+    case Intrinsic::amdgcn_mul_u24:
+      NewOpcode = AMDGPUISD::MUL_U24;
+      break;
+    case Intrinsic::amdgcn_mulhi_i24:
+      NewOpcode = AMDGPUISD::MULHI_I24;
+      break;
+    case Intrinsic::amdgcn_mulhi_u24:
+      NewOpcode = AMDGPUISD::MULHI_U24;
+      break;
+    default:
+      llvm_unreachable("Expected 24-bit mul intrinsic");
+    }
   }
 
   APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
@@ -3107,6 +3121,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
   switch (IID) {
   case Intrinsic::amdgcn_mul_i24:
   case Intrinsic::amdgcn_mul_u24:
+  case Intrinsic::amdgcn_mulhi_i24:
+  case Intrinsic::amdgcn_mulhi_u24:
     return simplifyMul24(N, DCI);
   case Intrinsic::amdgcn_fract:
   case Intrinsic::amdgcn_rsq:

diff  --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 7c15e731a555..eaa45b929b2b 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -575,11 +575,9 @@ define i64 @test_umul48_i64(i64 %lhs, i64 %rhs) {
 ; GCN-LABEL: test_umul48_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, 0xffffff
-; GCN-NEXT:    v_and_b32_e32 v1, s4, v0
-; GCN-NEXT:    v_and_b32_e32 v3, s4, v2
-; GCN-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
-; GCN-NEXT:    v_mul_hi_u32_u24_e32 v1, v1, v3
+; GCN-NEXT:    v_mul_u32_u24_e32 v3, v0, v2
+; GCN-NEXT:    v_mul_hi_u32_u24_e32 v1, v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lhs24 = and i64 %lhs, 16777215
   %rhs24 = and i64 %rhs, 16777215
@@ -588,49 +586,16 @@ define i64 @test_umul48_i64(i64 %lhs, i64 %rhs) {
 }
 
 define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; SI-LABEL: test_umul48_v2i64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, 0xffffff
-; SI-NEXT:    v_mul_u32_u24_e32 v5, v0, v4
-; SI-NEXT:    v_mul_u32_u24_e32 v7, v2, v6
-; SI-NEXT:    v_and_b32_e32 v2, s4, v2
-; SI-NEXT:    v_and_b32_e32 v0, s4, v0
-; SI-NEXT:    v_and_b32_e32 v3, s4, v6
-; SI-NEXT:    v_and_b32_e32 v1, s4, v4
-; SI-NEXT:    v_mul_hi_u32_u24_e32 v1, v0, v1
-; SI-NEXT:    v_mul_hi_u32_u24_e32 v3, v2, v3
-; SI-NEXT:    v_mov_b32_e32 v0, v5
-; SI-NEXT:    v_mov_b32_e32 v2, v7
-; SI-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: test_umul48_v2i64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, 0xffffff
-; VI-NEXT:    v_and_b32_e32 v3, s4, v2
-; VI-NEXT:    v_and_b32_e32 v1, s4, v0
-; VI-NEXT:    v_and_b32_e32 v5, s4, v6
-; VI-NEXT:    v_and_b32_e32 v7, s4, v4
-; VI-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
-; VI-NEXT:    v_mul_hi_u32_u24_e32 v1, v1, v7
-; VI-NEXT:    v_mul_u32_u24_e32 v2, v2, v6
-; VI-NEXT:    v_mul_hi_u32_u24_e32 v3, v3, v5
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_umul48_v2i64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
-; GFX9-NEXT:    v_and_b32_e32 v3, s4, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v0
-; GFX9-NEXT:    v_and_b32_e32 v5, s4, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, s4, v4
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, v1, v7
-; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v2, v6
-; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v3, v3, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_umul48_v2i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_u32_u24_e32 v5, v0, v4
+; GCN-NEXT:    v_mul_hi_u32_u24_e32 v1, v0, v4
+; GCN-NEXT:    v_mul_u32_u24_e32 v4, v2, v6
+; GCN-NEXT:    v_mul_hi_u32_u24_e32 v3, v2, v6
+; GCN-NEXT:    v_mov_b32_e32 v0, v5
+; GCN-NEXT:    v_mov_b32_e32 v2, v4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lhs24 = and <2 x i64> %lhs, <i64 16777215, i64 16777215>
   %rhs24 = and <2 x i64> %rhs, <i64 16777215, i64 16777215>
   %mul = mul <2 x i64> %lhs24, %rhs24