[llvm] r331916 - AMDGPU: Partially shrink 64-bit shifts if reduced to 16-bit

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed May 9 13:52:44 PDT 2018


Author: arsenm
Date: Wed May  9 13:52:43 2018
New Revision: 331916

URL: http://llvm.org/viewvc/llvm-project?rev=331916&view=rev
Log:
AMDGPU: Partially shrink 64-bit shifts if reduced to 16-bit

This is an extension of an existing combine that reduces wider
shls if the result fits in the final result type. This introduces
the same combine for srl, sra, and shl, but only reduces the shift
to a middle-sized (32-bit) type, to avoid the slow 64-bit shift.
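
For reference, the scalar identity behind the combine can be sketched in
plain C++ (not part of the commit; names are illustrative only): for a
shift amount K <= 16, the low 16 bits of a 64-bit right shift come from
bits [K, K+15] of the source, all of which sit inside the low 32 bits, so
the shift can be performed on a 32-bit truncation of the source first.

  #include <cstdint>

  // Sketch only: (uint16_t)(x >> K) == (uint16_t)((uint32_t)x >> K) for K <= 16.
  uint16_t trunc_srl_i64(uint64_t x, unsigned K) {  // original 64-bit shift
    return static_cast<uint16_t>(x >> K);
  }

  uint16_t trunc_srl_i32(uint64_t x, unsigned K) {  // shrunk 32-bit shift
    return static_cast<uint16_t>(static_cast<uint32_t>(x) >> K);
  }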

Added:
    llvm/trunk/test/CodeGen/AMDGPU/partial-shift-shrink.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=331916&r1=331915&r2=331916&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Wed May  9 13:52:43 2018
@@ -3144,6 +3144,36 @@ SDValue AMDGPUTargetLowering::performTru
     }
   }
 
+  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
+  //
+  // i16 (trunc (srl i64:x, K)), K <= 16 ->
+  //     i16 (trunc (srl (i32 (trunc x), K)))
+  if (VT.getScalarSizeInBits() < 32) {
+    EVT SrcVT = Src.getValueType();
+    if (SrcVT.getScalarSizeInBits() > 32 &&
+        (Src.getOpcode() == ISD::SRL ||
+         Src.getOpcode() == ISD::SRA ||
+         Src.getOpcode() == ISD::SHL)) {
+      if (auto ShiftAmount = isConstOrConstSplat(Src.getOperand(1))) {
+        if (ShiftAmount->getZExtValue() <= VT.getScalarSizeInBits()) {
+          EVT MidVT = VT.isVector() ?
+            EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                             VT.getVectorNumElements()) : MVT::i32;
+
+          EVT ShiftTy = getShiftAmountTy(MidVT, DAG.getDataLayout());
+          SDValue NewShiftAmt = DAG.getConstant(ShiftAmount->getZExtValue(),
+                                                SL, ShiftTy);
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
+                                      Src.getOperand(0));
+          DCI.AddToWorklist(Trunc.getNode());
+          SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
+                                            Trunc, NewShiftAmt);
+          return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
+        }
+      }
+    }
+  }
+
   return SDValue();
 }
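
The key guard is ShiftAmount <= VT.getScalarSizeInBits(): with an i16 result,
a shift amount of 17 would pull bit 32 of the source into the result, which a
32-bit shift cannot see, so the combine does not fire. A standalone host-side
check of that boundary (not part of the commit; illustration only):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t x = 1ull << 32;  // only bit 32 set
    // K = 17: bit 32 lands in result bit 15, so the full 64-bit shift is needed.
    assert(static_cast<uint16_t>(x >> 17) == (1u << 15));
    // Truncating to 32 bits first drops bit 32 and would give the wrong answer.
    assert(static_cast<uint16_t>(static_cast<uint32_t>(x) >> 17) == 0);
    return 0;
  }

This matches the trunc_srl_i64_17_to_i16 test added below, which is expected
to keep the v_lshrrev_b64.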
 

Added: llvm/trunk/test/CodeGen/AMDGPU/partial-shift-shrink.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/partial-shift-shrink.ll?rev=331916&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/partial-shift-shrink.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/partial-shift-shrink.ll Wed May  9 13:52:43 2018
@@ -0,0 +1,102 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+
+; Test combine to reduce the width of a 64-bit shift to 32-bit if
+; truncated to 16-bit.
+
+; GCN-LABEL: {{^}}trunc_srl_i64_16_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_srl_i64_16_to_i16(i64 %x) {
+  %shift = lshr i64 %x, 16
+  %trunc = trunc i64 %shift to i16
+  ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_srl_i64_17_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b64 v[0:1], 17, v[0:1]
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_srl_i64_17_to_i16(i64 %x) {
+  %shift = lshr i64 %x, 17
+  %trunc = trunc i64 %shift to i16
+  ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_srl_i55_16_to_i15:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 15, v0
+; GCN-NEXT: v_add_u16_e32 v0, 4, v0
+; GCN-NEXT: s_setpc_b64
+define i15 @trunc_srl_i55_16_to_i15(i55 %x) {
+  %shift = lshr i55 %x, 15
+  %trunc = trunc i55 %shift to i15
+  %add = add i15 %trunc, 4
+  ret i15 %add
+}
+
+; GCN-LABEL: {{^}}trunc_sra_i64_16_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_sra_i64_16_to_i16(i64 %x) {
+  %shift = ashr i64 %x, 16
+  %trunc = trunc i64 %shift to i16
+  ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_sra_i64_17_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b64 v[0:1], 17, v[0:1]
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_sra_i64_17_to_i16(i64 %x) {
+  %shift = ashr i64 %x, 17
+  %trunc = trunc i64 %shift to i16
+  ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_shl_i64_16_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_shl_i64_16_to_i16(i64 %x) {
+  %shift = shl i64 %x, 16
+  %trunc = trunc i64 %shift to i16
+  ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_shl_i64_17_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_shl_i64_17_to_i16(i64 %x) {
+  %shift = shl i64 %x, 17
+  %trunc = trunc i64 %shift to i16
+  ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_srl_v2i64_16_to_v2i16:
+; GCN: s_waitcnt
+; GCN-DAG: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0000
+; GCN: v_and_or_b32 v0, v2, [[MASK]], v0
+; GCN-NEXT: s_setpc_b64
+define <2 x i16> @trunc_srl_v2i64_16_to_v2i16(<2 x i64> %x) {
+  %shift = lshr <2 x i64> %x, <i64 16, i64 16>
+  %trunc = trunc <2 x i64> %shift to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+; GCN-LABEL: {{^}}s_trunc_srl_i64_16_to_i16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_lshr_b32 [[VAL_SHIFT:s[0-9]+]], [[VAL]], 16
+; GCN: s_or_b32 [[RESULT:s[0-9]+]], [[VAL_SHIFT]], 4
+; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
+; GCN: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+define amdgpu_kernel void @s_trunc_srl_i64_16_to_i16(i64 %x) {
+  %shift = lshr i64 %x, 16
+  %trunc = trunc i64 %shift to i16
+  %add = or i16 %trunc, 4
+  store i16 %add, i16 addrspace(1)* undef
+  ret void
+}
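
The tests cover lshr, ashr, and shl at the K = 16 boundary, plus the vector
(v2i64 -> v2i16) and scalar/SGPR paths. A quick host-side harness checking
the same identity for all three shift kinds over the legal shift amounts
(not part of the commit; illustrative C++ only, assuming the usual
two's-complement conversions and arithmetic right shift for signed types):

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t samples[] = {0, 1, 0xffffull << 16, 0x8000000000000000ull,
                                0x123456789abcdef0ull, ~0ull};
    for (uint64_t x : samples) {
      for (unsigned K = 0; K <= 16; ++K) {
        uint32_t lo = static_cast<uint32_t>(x);
        // lshr: bits [K, K+15] of x all lie within the low 32 bits.
        assert(static_cast<uint16_t>(x >> K) ==
               static_cast<uint16_t>(lo >> K));
        // ashr: same bits, and no sign-fill reaches below bit 32 for K <= 16.
        assert(static_cast<uint16_t>(static_cast<int64_t>(x) >> K) ==
               static_cast<uint16_t>(static_cast<int32_t>(lo) >> K));
        // shl: the low 16 result bits only ever need the low 16 source bits.
        assert(static_cast<uint16_t>(x << K) ==
               static_cast<uint16_t>(lo << K));
      }
    }
    return 0;
  }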



