[PATCH] D16253: AMDGPU: Generalize shl combine

Fri Jan 15 19:37:37 PST 2016

arsenm created this revision.
arsenm added a reviewer: tstellarAMD.
arsenm added a subscriber: llvm-commits.
Herald added a subscriber: arsenm.

Reduce 64-bit shl with constant > 32. We already special cased
this for the == 32 case, but this also works for any >= 32 constant.


http://reviews.llvm.org/D16253

Files:
  lib/Target/AMDGPU/AMDGPUISelLowering.cpp
  test/CodeGen/AMDGPU/shift-i64-opts.ll

Index: test/CodeGen/AMDGPU/shift-i64-opts.ll
===================================================================

--- test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -62,3 +62,50 @@
   store i64 %shl, i64 addrspace(1)* %out
   ret void
 }
+
+; lshl (i64 x), c: c > 32 => reg_sequence lshl 0, (i32 lo_32(x)), (c - 32)
+
+; GCN-LABEL: {{^}}shl_i64_const_35:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]]
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = shl i64 %val, 35
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_i64_const_32:
+; GCN: buffer_load_dword v[[HI:[0-9]+]]
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = shl i64 %val, 32
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_i64_const_63:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]]
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = shl i64 %val, 63
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; ashr (i64 x), 63 => (ashr lo(x), 31), lo(x)
+
+; GCN-LABEL: {{^}}ashr_i64_const_gt_32:
+define void @ashr_i64_const_gt_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = ashr i64 %val, 35
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2546,25 +2546,30 @@
 
   // i64 (shl x, 32) -> (build_pair 0, x)
 
-  // Doing this with moves theoretically helps MI optimizations that understand
-  // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
-  // v_lshl_b64. In the SALU case, I think this is slightly worse since it
-  // doubles the code size and I'm unsure about cycle count.
+  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+  // common case, splitting this into a move and a 32-bit shift is faster and
+  // the same code size.
   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (!RHS || RHS->getZExtValue() != 32)
+  if (!RHS)
+    return SDValue();
+
+  unsigned RHSVal = RHS->getZExtValue();
+  if (RHSVal < 32)
     return SDValue();
 
   SDValue LHS = N->getOperand(0);
 
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
-  // Extract low 32-bits.
+  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
+
   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
 
   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
 
-  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Zero, Lo);
+  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Zero, NewShift);
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
 }
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D16253.45066.patch
Type: text/x-patch
Size: 3517 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20160116/1bd460e9/attachment-0001.bin>