[llvm] r242177 - AMDGPU: Avoid using 64-bit shift for i64 (shl x, 32)

Tue Jul 14 11:20:33 PDT 2015

Author: arsenm
Date: Tue Jul 14 13:20:33 2015
New Revision: 242177

URL: http://llvm.org/viewvc/llvm-project?rev=242177&view=rev
Log:
AMDGPU: Avoid using 64-bit shift for i64 (shl x, 32)

This can be done only with moves which theoretically
will optimize better later.

Although this transform increases the instruction count,
it should be code size / cycle count neutral in the worst
VALU case. It also seems to slightly improve a couple
of testcases due to other DAG combines this exposes.

This is probably slightly worse for the SALU case, so
it might be better to handle this during moveToVALU,
although then you lose some simplifications like
the load width reducing in the simple testcase.

Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/mul_uint24.ll
    llvm/trunk/test/CodeGen/AMDGPU/shl.ll
    llvm/trunk/test/CodeGen/AMDGPU/srl.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=242177&r1=242176&r2=242177&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Tue Jul 14 13:20:33 2015
@@ -406,6 +406,7 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 
+  setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SELECT_CC);
@@ -2415,6 +2416,33 @@ SDValue AMDGPUTargetLowering::performSto
                       SN->getBasePtr(), SN->getMemOperand());
 }
 
+SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  // i64 (shl x, 32) -> (build_pair 0, x)
+
+  // Doing this with moves theoretically helps MI optimizations that understand
+  // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
+  // v_lshl_b64. In the SALU case, I think this is slightly worse since it
+  // doubles the code size and I'm unsure about cycle count.
+  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS || RHS->getZExtValue() != 32)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Extract low 32-bits.
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+}
+
 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
@@ -2454,6 +2482,12 @@ SDValue AMDGPUTargetLowering::PerformDAG
   switch(N->getOpcode()) {
   default:
     break;
+  case ISD::SHL: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    return performShlCombine(N, DCI);
+  }
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case AMDGPUISD::MUL_I24:

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=242177&r1=242176&r2=242177&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Tue Jul 14 13:20:33 2015
@@ -65,6 +65,7 @@ private:
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
 protected:

Modified: llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll?rev=242177&r1=242176&r2=242177&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll Tue Jul 14 13:20:33 2015
@@ -3,8 +3,9 @@
 declare i32 @llvm.SI.tid() readnone
 
 ; SI-LABEL: {{^}}test_array_ptr_calc:
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_i32
+; SI: s_endpgm
 define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.SI.tid() readnone
   %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0

Modified: llvm/trunk/test/CodeGen/AMDGPU/mul_uint24.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/mul_uint24.ll?rev=242177&r1=242176&r2=242177&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/mul_uint24.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/mul_uint24.ll Tue Jul 14 13:20:33 2015
@@ -52,16 +52,18 @@ entry:
 ; FUNC_LABEL: {{^}}mul24_i64:
 ; EG; MUL_UINT24
 ; EG: MULHI
-; SI: v_mul_u32_u24
 ; FIXME: SI support 24-bit mulhi
-; SI: v_mul_hi_u32
-define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+
+; SI-DAG: v_mul_u32_u24
+; SI-DAG: v_mul_hi_u32
+; SI: s_endpgm
+define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
 entry:
-  %0 = shl i64 %a, 40
-  %a_24 = lshr i64 %0, 40
-  %1 = shl i64 %b, 40
-  %b_24 = lshr i64 %1, 40
-  %2 = mul i64 %a_24, %b_24
-  store i64 %2, i64 addrspace(1)* %out
+  %tmp0 = shl i64 %a, 40
+  %a_24 = lshr i64 %tmp0, 40
+  %tmp1 = shl i64 %b, 40
+  %b_24 = lshr i64 %tmp1, 40
+  %tmp2 = mul i64 %a_24, %b_24
+  store i64 %tmp2, i64 addrspace(1)* %out
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/shl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/shl.ll?rev=242177&r1=242176&r2=242177&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shl.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/shl.ll Tue Jul 14 13:20:33 2015
@@ -1,6 +1,9 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; XUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
 
 ;EG: {{^}}shl_v2i32:
 ;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -178,3 +181,32 @@ define void @shl_v4i64(<4 x i64> addrspa
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
 }
+
+; Make sure load width gets reduced to i32 load.
+; GCN-LABEL: {{^}}s_shl_32_i64:
+; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
+; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
+  %result = shl i64 %a, 32
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_shl_32_i64:
+; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
+define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep.in
+  %result = shl i64 %a, 32
+  store i64 %result, i64 addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }

Modified: llvm/trunk/test/CodeGen/AMDGPU/srl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/srl.ll?rev=242177&r1=242176&r2=242177&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/srl.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/srl.ll Tue Jul 14 13:20:33 2015
@@ -1,7 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
+declare i32 @llvm.r600.read.tidig.x() #0
+
 ; FUNC-LABEL: {{^}}lshr_i32:
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
@@ -184,3 +186,32 @@ define void @lshr_v4i64(<4 x i64> addrsp
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
 }
+
+; Make sure load width gets reduced to i32 load.
+; GCN-LABEL: {{^}}s_lshr_32_i64:
+; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}}
+; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
+  %result = lshr i64 %a, 32
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_lshr_32_i64:
+; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
+define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep.in
+  %result = lshr i64 %a, 32
+  store i64 %result, i64 addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }