[llvm] r352885 - [AMDGPU] Fix for vector element insertion
Tim Corringham via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 1 08:51:09 PST 2019
Author: timcorringham
Date: Fri Feb 1 08:51:09 2019
New Revision: 352885
URL: http://llvm.org/viewvc/llvm-project?rev=352885&view=rev
Log:
[AMDGPU] Fix for vector element insertion
Summary:
Incorrect code was generated when lowering insertelement operations
for vectors with 8- or 16-bit elements. The value being inserted was
not adjusted for the position of the element within the 32-bit word,
so only the low element within each 32-bit word could receive
the intended value.
Fixed by simply replicating the value to each element of a
congruent vector before the mask-and-OR operation used to
update the intended element.
A number of affected LIT tests have been updated appropriately.
Reviewers: arsenm, nhaehnle
Reviewed By: arsenm
Subscribers: llvm-commits, arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D57588
Modified:
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
llvm/trunk/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll
llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=352885&r1=352884&r2=352885&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri Feb 1 08:51:09 2019
@@ -4369,12 +4369,12 @@ SDValue SITargetLowering::lowerINSERT_VE
MVT IntVT = MVT::getIntegerVT(VecSize);
// Avoid stack access for dynamic indexing.
- SDValue Val = InsVal;
- if (InsVal.getValueType() == MVT::f16)
- Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
-
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
- SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
+
+ // Create a congruent vector with the target value in each element so that
+ // the required element can be masked and ORed into the target vector.
+ SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
+ DAG.getSplatBuildVector(VecVT, SL, InsVal));
assert(isPowerOf2_32(EltSize));
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
Modified: llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll?rev=352885&r1=352884&r2=352885&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll Fri Feb 1 08:51:09 2019
@@ -814,8 +814,8 @@ define half @v_test_canonicalize_extract
}
; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
-; GFX9: v_pk_mul_f16
; GFX9: v_mul_f16_e32
+; GFX9: v_pk_mul_f16
; GFX9-NOT: v_max
; GFX9-NOT: v_pk_max
define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
Modified: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert_vector_dynelt.ll?rev=352885&r1=352884&r2=352885&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_dynelt.ll Fri Feb 1 08:51:09 2019
@@ -112,7 +112,10 @@ entry:
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00
+; GCN: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], [[K]]
+; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
+; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
entry:
%v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
@@ -168,9 +171,10 @@ entry:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x10001
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
-; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], 1, v{{[0-9]+}}
+; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], [[K]], v{{[0-9]+}}
define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
entry:
%v = insertelement <2 x i16> %vec, i16 1, i32 %sel
@@ -184,7 +188,10 @@ entry:
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+; GCN: s_mov_b32 [[K:s[0-9]+]], 0x10001
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], [[K]]
+; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
+; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
entry:
%v = insertelement <4 x i16> %vec, i16 1, i32 %sel
@@ -197,7 +204,11 @@ entry:
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+; GCN: s_mov_b32 [[K:s[0-9]+]], 0x1010101
+; GCN: s_and_b32 s3, s1, [[K]]
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
+; GCN: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
+; GCN: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
entry:
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
Modified: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll?rev=352885&r1=352884&r2=352885&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll Fri Feb 1 08:51:09 2019
@@ -242,7 +242,7 @@ define amdgpu_kernel void @dynamic_inser
; VI-NOT: _load
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
-; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
+; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 0x505, [[MASK]]
; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]]
; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]]
; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]]
@@ -261,15 +261,14 @@ define amdgpu_kernel void @dynamic_inser
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load
+; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505
; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
-; VI: s_andn2_b32 [[AND_NOT_MASK:s[0-9]+]], [[LOAD]], [[SHIFTED_MASK]]
-; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
-; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16
+; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]]
+; VI: v_lshrrev_b32_e32 [[V_HI2:v[0-9]+]], 16, [[BFI]]
-; VI-DAG: buffer_store_short [[BFI]]
-; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]]
+; VI: buffer_store_short [[BFI]]
; VI: buffer_store_byte [[V_HI2]]
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
%vecins = insertelement <3 x i8> %a, i8 5, i32 %b
@@ -282,10 +281,11 @@ define amdgpu_kernel void @dynamic_inser
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load
+; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505
; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
-; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
+; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]]
; VI: buffer_store_dword [[BFI]]
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
%vecins = insertelement <4 x i8> %a, i8 5, i32 %b
@@ -303,9 +303,11 @@ define amdgpu_kernel void @dynamic_inser
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
+; VI: s_mov_b32 [[VAL:s[0-9]+]], 0x5050505
+; VI: s_and_b32 s[[INS_HI:[0-9]+]], s[[MASK_SHIFT_HI]], [[VAL]]
+; VI: s_and_b32 s[[INS_LO:[0-9]+]], s[[MASK_SHIFT_LO]], [[VAL]]
; VI: s_andn2_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[VEC]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
-; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5
-; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]]
+; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS_LO]]:[[INS_HI]]{{\]}}, [[AND]]
; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]]
; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}}
Modified: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll?rev=352885&r1=352884&r2=352885&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll Fri Feb 1 08:51:09 2019
@@ -446,7 +446,7 @@ define amdgpu_kernel void @v_inserteleme
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
+; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x12341234
; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
@@ -611,25 +611,20 @@ define amdgpu_kernel void @v_inserteleme
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
; GCN-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
-; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff{{$}}
-
-; GFX89: v_lshlrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}
-; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_LO:[0-9+]]], v[[SHIFT_LO]]
-; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_HI:[0-9+]]], v[[SHIFT_HI]]
-; GFX89-DAG: v_and_b32_e32 v[[MASK:[0-9]+]], [[VAL]], v[[SHIFT_LO]]
-
-; GFX89-DAG: v_and_b32_e32 v[[AND0:[0-9]+]], v[[NOT_SHIFT_LO]], v[[LO]]
-; GFX89-DAG: v_and_b32_e32 v[[AND1:[0-9]+]], v[[NOT_SHIFT_HI]], v[[HI]]
-; GFX89: v_or_b32_sdwa v[[OR_SDWA:[0-9]+]], v[[MASK]], v[[AND0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-
-
-; CI: v_lshl_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
-; CI-DAG: v_bfi_b32 v[[OR_SDWA:[0-9]+]], v[[SHIFT_LO]],
-; CI-DAG: v_bfi_b32 v[[AND1:[0-9]+]], v[[SHIFT_HI]], 0,
+; CIVI-DAG: s_and_b32 [[MASKED_VAL:s[0-9]+]], [[VAL]], s[[MASK_LO]]
+; VI-DAG: s_lshl_b32 [[SHIFTED_VAL:s[0-9]+]], [[MASKED_VAL]], 16
+; CI-DAG: s_lshl_b32 [[SHIFTED_VAL:s[0-9]+]], [[VAL]], 16
+; CIVI: s_or_b32 [[DUP_VAL:s[0-9]+]], [[MASKED_VAL]], [[SHIFTED_VAL]]
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; GFX9-DAG: s_pack_ll_b32_b16 [[DUP_VAL:s[0-9]+]], [[VAL]], [[VAL]]
+; GFX89: v_lshlrev_b64 v[{{[0-9:]+}}], [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}
+; CI: v_lshl_b64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SCALED_IDX]]
+; GCN: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[DUP_VAL]], v{{[0-9]+}}
+; GCN: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[DUP_VAL]], v{{[0-9]+}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[OR_SDWA]]:[[AND1]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
Modified: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll?rev=352885&r1=352884&r2=352885&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll Fri Feb 1 08:51:09 2019
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
+; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll?rev=352885&r1=352884&r2=352885&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll Fri Feb 1 08:51:09 2019
@@ -6,7 +6,7 @@
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
+; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
More information about the llvm-commits
mailing list