[llvm] 75b292c - [AMDGPU][DAG] Fix insert_vector_elt lowering for 8 bit elements
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 4 07:48:25 PDT 2022
Author: Pierre van Houtryve
Date: 2022-10-04T14:48:15Z
New Revision: 75b292cb14141cac086aa595154134d789c5fce7
URL: https://github.com/llvm/llvm-project/commit/75b292cb14141cac086aa595154134d789c5fce7
DIFF: https://github.com/llvm/llvm-project/commit/75b292cb14141cac086aa595154134d789c5fce7.diff
LOG: [AMDGPU][DAG] Fix insert_vector_elt lowering for 8 bit elements
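The bitmask used to select the bits of the target element was hard-coded to 0xffff, i.e. it assumed 16-bit elements and did not take the actual element size into account. For 8-bit elements the mask must be 0xff, so it is now derived from the element size.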
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D135156
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f757ca16ef3e..6e0478ed2f16 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5733,11 +5733,11 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
// Convert vector index to bit-index and get the required bit mask.
assert(isPowerOf2_32(EltSize));
+ const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
- DAG.getConstant(0xffff, SL, IntVT),
- ScaledIdx);
+ DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
// 1. Create a congruent vector with the target value in each element.
SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index c46f853a3da2..f27c6800c69e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -447,7 +447,7 @@ define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %v
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s6, s[0:1], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b64 s[4:5], 0xffff
+; GCN-NEXT: s_mov_b64 s[4:5], 0xff
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s6, s6, 3
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s6
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 8f84abdc6da1..f1b4399fad09 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1033,7 +1033,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s5, s6, 3
-; SI-NEXT: s_lshl_b32 s5, -1, s5
+; SI-NEXT: s_lshl_b32 s5, 0xff, s5
; SI-NEXT: s_andn2_b32 s4, s4, s5
; SI-NEXT: s_and_b32 s5, s5, 0x505
; SI-NEXT: s_or_b32 s4, s5, s4
@@ -1046,14 +1046,15 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
; VI-NEXT: s_load_dword s6, s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x28
+; VI-NEXT: v_mov_b32_e32 v0, 0xff
; VI-NEXT: s_mov_b32 s3, 0x1100f000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s5, s6, 3
-; VI-NEXT: v_lshlrev_b16_e64 v0, s5, -1
+; VI-NEXT: v_lshlrev_b16_e32 v0, s5, v0
; VI-NEXT: v_not_b32_e32 v1, v0
; VI-NEXT: v_and_b32_e32 v1, s4, v1
; VI-NEXT: v_and_b32_e32 v0, 0x505, v0
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
@@ -1074,7 +1075,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s5, s6, 3
-; SI-NEXT: s_lshl_b32 s5, 0xffff, s5
+; SI-NEXT: s_lshl_b32 s5, 0xff, s5
; SI-NEXT: s_andn2_b32 s4, s4, s5
; SI-NEXT: s_and_b32 s5, s5, 0x5050505
; SI-NEXT: s_or_b32 s4, s5, s4
@@ -1094,7 +1095,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s5, s6, 3
-; VI-NEXT: s_lshl_b32 s5, 0xffff, s5
+; VI-NEXT: s_lshl_b32 s5, 0xff, s5
; VI-NEXT: s_andn2_b32 s4, s4, s5
; VI-NEXT: s_and_b32 s5, s5, 0x5050505
; VI-NEXT: s_or_b32 s4, s5, s4
@@ -1119,7 +1120,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s5, s6, 3
-; SI-NEXT: s_lshl_b32 s5, 0xffff, s5
+; SI-NEXT: s_lshl_b32 s5, 0xff, s5
; SI-NEXT: s_andn2_b32 s4, s4, s5
; SI-NEXT: s_and_b32 s5, s5, 0x5050505
; SI-NEXT: s_or_b32 s4, s5, s4
@@ -1136,7 +1137,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s5, s6, 3
-; VI-NEXT: s_lshl_b32 s5, 0xffff, s5
+; VI-NEXT: s_lshl_b32 s5, 0xff, s5
; VI-NEXT: s_andn2_b32 s4, s4, s5
; VI-NEXT: s_and_b32 s5, s5, 0x5050505
; VI-NEXT: s_or_b32 s4, s5, s4
@@ -1160,7 +1161,7 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SI-NEXT: s_lshl_b32 s8, s8, 3
-; SI-NEXT: s_mov_b64 s[2:3], 0xffff
+; SI-NEXT: s_mov_b64 s[2:3], 0xff
; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
; SI-NEXT: s_and_b32 s9, s3, 0x5050505
; SI-NEXT: s_and_b32 s8, s2, 0x5050505
@@ -1183,7 +1184,7 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; VI-NEXT: s_lshl_b32 s8, s8, 3
-; VI-NEXT: s_mov_b64 s[2:3], 0xffff
+; VI-NEXT: s_mov_b64 s[2:3], 0xff
; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
; VI-NEXT: s_and_b32 s9, s3, 0x5050505
; VI-NEXT: s_and_b32 s8, s2, 0x5050505
More information about the llvm-commits mailing list