[llvm] r331909 - AMDGPU: Add combine for trunc of bitcast from build_vector

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed May 9 11:37:39 PDT 2018


Author: arsenm
Date: Wed May  9 11:37:39 2018
New Revision: 331909

URL: http://llvm.org/viewvc/llvm-project?rev=331909&view=rev
Log:
AMDGPU: Add combine for trunc of bitcast from build_vector

If the truncate is only accessing the first element of the vector,
we can use the original source value.

This helps with some combine ordering issues after operations are
lowered to integer operations between bitcasts of build_vector.
In particular it stops unnecessarily materializing the unused
top half of a vector in some cases.

Added:
    llvm/trunk/test/CodeGen/AMDGPU/trunc-combine.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
    llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Wed May  9 11:37:39 2018
@@ -574,6 +574,7 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::MULHU);
   setTargetDAGCombine(ISD::MULHS);
@@ -3119,6 +3120,33 @@ SDValue AMDGPUTargetLowering::performSrl
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
 }
 
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+
+  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+  if (Src.getOpcode() == ISD::BITCAST) {
+    SDValue Vec = Src.getOperand(0);
+    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue Elt0 = Vec.getOperand(0);
+      EVT EltVT = Elt0.getValueType();
+      if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+        if (EltVT.isFloatingPoint()) {
+          Elt0 = DAG.getNode(ISD::BITCAST, SL,
+                             EltVT.changeTypeToInteger(), Elt0);
+        }
+
+        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 // We need to specifically handle i64 mul here to avoid unnecessary conversion
 // instructions. If we only match on the legalized i64 mul expansion,
 // SimplifyDemandedBits will be unable to remove them because there will be
@@ -3758,6 +3786,8 @@ SDValue AMDGPUTargetLowering::PerformDAG
 
     return performSraCombine(N, DCI);
   }
+  case ISD::TRUNCATE:
+    return performTruncateCombine(N, DCI);
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case ISD::MULHS:

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Wed May  9 11:37:39 2018
@@ -87,6 +87,7 @@ protected:
   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;

Modified: llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll Wed May  9 11:37:39 2018
@@ -282,7 +282,7 @@ define <2 x i16> @v2i16_func_void() #0 {
 }
 
 ; GCN-LABEL: {{^}}v3i16_func_void:
-; GFX9: buffer_load_dwordx2 v[0:1], off
+; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9: v_lshrrev_b32
 ; GFX9: s_setpc_b64
@@ -304,9 +304,8 @@ define <4 x i16> @v4i16_func_void() #0 {
 ; GCN-LABEL: {{^}}v5i16_func_void:
 ; GFX9: buffer_load_dwordx2 v[0:1]
 ; GFX9: buffer_load_ushort v4
+; GFX9: v_lshrrev_b32_e32 v5, 16, v0
 ; GFX9: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9: v_mov_b32_e32 v2, v1
-; GFX9: v_lshrrev_b32_e32 v1, 16, v0
 ; GCN: s_setpc_b64
 define <5 x i16> @v5i16_func_void() #0 {
   %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll Wed May  9 11:37:39 2018
@@ -8,11 +8,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -26,11 +26,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -44,11 +44,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -62,11 +62,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -80,11 +80,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll Wed May  9 11:37:39 2018
@@ -41,11 +41,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -59,11 +59,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -77,11 +77,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -95,10 +95,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -112,10 +113,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)

Modified: llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll Wed May  9 11:37:39 2018
@@ -3,18 +3,28 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 ; GCN-LABEL: {{^}}local_store_i56:
-; GCN-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
+; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+
 define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
   store i56 %arg, i56 addrspace(3)* %ptr, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}local_store_i55:
-; GCN-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
+; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
 define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
   store i55 %arg, i55 addrspace(3)* %ptr, align 8
   ret void

Added: llvm/trunk/test/CodeGen/AMDGPU/trunc-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/trunc-combine.ll?rev=331909&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc-combine.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc-combine.ll Wed May  9 11:37:39 2018
@@ -0,0 +1,55 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+
+; Make sure high constant 0 isn't pointlessly materialized
+; GCN-LABEL: {{^}}trunc_bitcast_i64_lshr_32_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_bitcast_i64_lshr_32_i16(i64 %bar) {
+  %srl = lshr i64 %bar, 32
+  %trunc = trunc i64 %srl to i16
+  ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_bitcast_i64_lshr_32_i32:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64
+define i32 @trunc_bitcast_i64_lshr_32_i32(i64 %bar) {
+  %srl = lshr i64 %bar, 32
+  %trunc = trunc i64 %srl to i32
+  ret i32 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_bitcast_v2i32_to_i16:
+; GCN: _load_dword
+; GCN-NOT: _load_dword
+; GCN-NOT: v_mov_b32
+; GCN: v_add_u32_e32 v0, vcc, 4, v0
+define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
+  %load0 = load i32, i32 addrspace(1)* undef
+  %load1 = load i32, i32 addrspace(1)* null
+  %insert.0 = insertelement <2 x i32> undef, i32 %load0, i32 0
+  %insert.1 = insertelement <2 x i32> %insert.0, i32 99, i32 1
+  %bc = bitcast <2 x i32> %insert.1 to i64
+  %trunc = trunc i64 %bc to i16
+  %add = add i16 %trunc, 4
+  ret i16 %add
+}
+
+; Make sure there's no crash if the source vector type is FP
+; GCN-LABEL: {{^}}trunc_bitcast_v2f32_to_i16:
+; GCN: _load_dword
+; GCN-NOT: _load_dword
+; GCN-NOT: v_mov_b32
+; GCN: v_add_u32_e32 v0, vcc, 4, v0
+define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
+  %load0 = load float, float addrspace(1)* undef
+  %load1 = load float, float addrspace(1)* null
+  %insert.0 = insertelement <2 x float> undef, float %load0, i32 0
+  %insert.1 = insertelement <2 x float> %insert.0, float 4.0, i32 1
+  %bc = bitcast <2 x float> %insert.1 to i64
+  %trunc = trunc i64 %bc to i16
+  %add = add i16 %trunc, 4
+  ret i16 %add
+}




More information about the llvm-commits mailing list