[llvm] r331909 - AMDGPU: Add combine for trunc of bitcast from build_vector
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed May 9 11:37:39 PDT 2018
Author: arsenm
Date: Wed May 9 11:37:39 2018
New Revision: 331909
URL: http://llvm.org/viewvc/llvm-project?rev=331909&view=rev
Log:
AMDGPU: Add combine for trunc of bitcast from build_vector
If the truncate only accesses the first element of the vector,
we can use the original source value directly.
This helps with some combine ordering issues that arise after
operations are lowered to integer operations between bitcasts of
build_vector. In particular, it stops unnecessarily materializing
the unused top half of a vector in some cases.
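For illustration, here is a minimal IR sketch of the pattern this
combine targets (a hand-written example along the lines of the
trunc_bitcast_v2i32_to_i16 test added below; the function name is
made up):

define i16 @trunc_bitcast_example(i32 %x) {
  ; Element 0 holds the value we care about; element 1 is unused.
  %vec.0 = insertelement <2 x i32> undef, i32 %x, i32 0
  %vec.1 = insertelement <2 x i32> %vec.0, i32 99, i32 1
  %bc = bitcast <2 x i32> %vec.1 to i64
  ; The truncate only reads the low 16 bits, i.e. bits of element 0.
  %trunc = trunc i64 %bc to i16
  ret i16 %trunc
}

On a little-endian target like AMDGPU, the low bits of %bc come from
element 0 of the build_vector, so the truncate can read %x directly
and the constant 99 in the high element never needs to be
materialized.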
Added:
llvm/trunk/test/CodeGen/AMDGPU/trunc-combine.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Wed May 9 11:37:39 2018
@@ -574,6 +574,7 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::MULHU);
setTargetDAGCombine(ISD::MULHS);
@@ -3119,6 +3120,33 @@ SDValue AMDGPUTargetLowering::performSrl
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+
+ // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+ if (Src.getOpcode() == ISD::BITCAST) {
+ SDValue Vec = Src.getOperand(0);
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue Elt0 = Vec.getOperand(0);
+ EVT EltVT = Elt0.getValueType();
+ if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+ if (EltVT.isFloatingPoint()) {
+ Elt0 = DAG.getNode(ISD::BITCAST, SL,
+ EltVT.changeTypeToInteger(), Elt0);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
// We need to specifically handle i64 mul here to avoid unnecessary conversion
// instructions. If we only match on the legalized i64 mul expansion,
// SimplifyDemandedBits will be unable to remove them because there will be
@@ -3758,6 +3786,8 @@ SDValue AMDGPUTargetLowering::PerformDAG
return performSraCombine(N, DCI);
}
+ case ISD::TRUNCATE:
+ return performTruncateCombine(N, DCI);
case ISD::MUL:
return performMulCombine(N, DCI);
case ISD::MULHS:
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Wed May 9 11:37:39 2018
@@ -87,6 +87,7 @@ protected:
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Modified: llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll Wed May 9 11:37:39 2018
@@ -282,7 +282,7 @@ define <2 x i16> @v2i16_func_void() #0 {
}
; GCN-LABEL: {{^}}v3i16_func_void:
-; GFX9: buffer_load_dwordx2 v[0:1], off
+; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
; GFX9: s_waitcnt vmcnt(0)
; GFX9: v_lshrrev_b32
; GFX9: s_setpc_b64
@@ -304,9 +304,8 @@ define <4 x i16> @v4i16_func_void() #0 {
; GCN-LABEL: {{^}}v5i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1]
; GFX9: buffer_load_ushort v4
+; GFX9: v_lshrrev_b32_e32 v5, 16, v0
; GFX9: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9: v_mov_b32_e32 v2, v1
-; GFX9: v_lshrrev_b32_e32 v1, 16, v0
; GCN: s_setpc_b64
define <5 x i16> @v5i16_func_void() #0 {
%ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll Wed May 9 11:37:39 2018
@@ -8,11 +8,11 @@
; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
; PACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
define amdgpu_kernel void @image_gather4_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -26,11 +26,11 @@ main_body:
; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
; PACKED: image_gather4_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
define amdgpu_kernel void @image_gather4_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -44,11 +44,11 @@ main_body:
; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
; PACKED: image_gather4_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
define amdgpu_kernel void @image_gather4_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -62,11 +62,11 @@ main_body:
; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
; PACKED: image_gather4_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
define amdgpu_kernel void @image_gather4_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -80,11 +80,11 @@ main_body:
; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
; PACKED: image_gather4_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
define amdgpu_kernel void @image_gather4_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll Wed May 9 11:37:39 2018
@@ -41,11 +41,11 @@ main_body:
; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
; PACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
define amdgpu_kernel void @image_sample_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -59,11 +59,11 @@ main_body:
; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
; PACKED: image_sample_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
define amdgpu_kernel void @image_sample_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -77,11 +77,11 @@ main_body:
; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
; PACKED: image_sample_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
define amdgpu_kernel void @image_sample_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -95,10 +95,11 @@ main_body:
; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
; PACKED: image_sample_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
define amdgpu_kernel void @image_sample_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -112,10 +113,11 @@ main_body:
; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
; PACKED: image_sample_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
define amdgpu_kernel void @image_sample_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
Modified: llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll?rev=331909&r1=331908&r2=331909&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll Wed May 9 11:37:39 2018
@@ -3,18 +3,28 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}local_store_i56:
-; GCN-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
+; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+
define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
store i56 %arg, i56 addrspace(3)* %ptr, align 8
ret void
}
; GCN-LABEL: {{^}}local_store_i55:
-; GCN-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
+; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
store i55 %arg, i55 addrspace(3)* %ptr, align 8
ret void
Added: llvm/trunk/test/CodeGen/AMDGPU/trunc-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/trunc-combine.ll?rev=331909&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc-combine.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc-combine.ll Wed May 9 11:37:39 2018
@@ -0,0 +1,55 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+
+; Make sure the constant 0 in the high half of the shift result isn't pointlessly materialized
+; GCN-LABEL: {{^}}trunc_bitcast_i64_lshr_32_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_bitcast_i64_lshr_32_i16(i64 %bar) {
+ %srl = lshr i64 %bar, 32
+ %trunc = trunc i64 %srl to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_bitcast_i64_lshr_32_i32:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64
+define i32 @trunc_bitcast_i64_lshr_32_i32(i64 %bar) {
+ %srl = lshr i64 %bar, 32
+ %trunc = trunc i64 %srl to i32
+ ret i32 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_bitcast_v2i32_to_i16:
+; GCN: _load_dword
+; GCN-NOT: _load_dword
+; GCN-NOT: v_mov_b32
+; GCN: v_add_u32_e32 v0, vcc, 4, v0
+define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
+ %load0 = load i32, i32 addrspace(1)* undef
+ %load1 = load i32, i32 addrspace(1)* null
+ %insert.0 = insertelement <2 x i32> undef, i32 %load0, i32 0
+ %insert.1 = insertelement <2 x i32> %insert.0, i32 99, i32 1
+ %bc = bitcast <2 x i32> %insert.1 to i64
+ %trunc = trunc i64 %bc to i16
+ %add = add i16 %trunc, 4
+ ret i16 %add
+}
+
+; Make sure there's no crash if the source vector type is FP
+; GCN-LABEL: {{^}}trunc_bitcast_v2f32_to_i16:
+; GCN: _load_dword
+; GCN-NOT: _load_dword
+; GCN-NOT: v_mov_b32
+; GCN: v_add_u32_e32 v0, vcc, 4, v0
+define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
+ %load0 = load float, float addrspace(1)* undef
+ %load1 = load float, float addrspace(1)* null
+ %insert.0 = insertelement <2 x float> undef, float %load0, i32 0
+ %insert.1 = insertelement <2 x float> %insert.0, float 4.0, i32 1
+ %bc = bitcast <2 x float> %insert.1 to i64
+ %trunc = trunc i64 %bc to i16
+ %add = add i16 %trunc, 4
+ ret i16 %add
+}