[llvm] ef13308 - AMDGPU/SDAG: Improve {extract,insert}_subvector lowering for 16-bit vectors
Nicolai Hähnle via llvm-commits
llvm-commits at lists.llvm.org
Fri May 5 01:55:29 PDT 2023
Author: Nicolai Hähnle
Date: 2023-05-05T10:55:18+02:00
New Revision: ef13308b2666be876f5a93c68c449d9fece48ac0
URL: https://github.com/llvm/llvm-project/commit/ef13308b2666be876f5a93c68c449d9fece48ac0
DIFF: https://github.com/llvm/llvm-project/commit/ef13308b2666be876f5a93c68c449d9fece48ac0.diff
LOG: AMDGPU/SDAG: Improve {extract,insert}_subvector lowering for 16-bit vectors
v2:
- simplify the escape to TableGen patterns
Differential Revision: https://reviews.llvm.org/D149841
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 79f31c2350733..e7a7b205b247f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1423,32 +1423,42 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
-
+ SDLoc SL(Op);
SmallVector<SDValue, 8> Args;
unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
EVT VT = Op.getValueType();
EVT SrcVT = Op.getOperand(0).getValueType();
- // For these types, we have some TableGen patterns except if the index is 1
- if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
- (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
- Start != 1)
- return Op;
+ if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
+ unsigned NumElt = VT.getVectorNumElements();
+ unsigned NumSrcElt = SrcVT.getVectorNumElements();
+ assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
- if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
- (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
- (Start == 0 || Start == 4))
- return Op;
+ // We have some TableGen patterns for when the extracted vector is exactly
+ // the low or high half of the operand.
+ if ((NumSrcElt == 2 * NumElt) && (Start == 0 || Start == NumElt))
+ return Op;
- if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
- (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
- (Start == 0 || Start == 8))
- return Op;
+ // Extract 32-bit registers at a time.
+ EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
+ EVT NewVT = NumElt == 2
+ ? MVT::i32
+ : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
+ SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
+
+ DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
+ if (NumElt == 2)
+ Tmp = Args[0];
+ else
+ Tmp = DAG.getBuildVector(NewVT, SL, Args);
+
+ return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
+ }
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
- return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
+ return DAG.getBuildVector(Op.getValueType(), SL, Args);
}
// TODO: Handle fabs too
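
To illustrate the extract side, here is a minimal IR sketch; the function is hypothetical and not taken from the commit's test files. A <2 x i16> slice at even index 2 of a <8 x i16> source is neither the low nor the high half, so it misses the TableGen patterns; rather than scalarizing into per-element extracts, the new code handles it as a single 32-bit extract:

  define <2 x i16> @extract_mid_v2i16(<8 x i16> %v) {
    ; Selecting the contiguous elements 2..3 can become an EXTRACT_SUBVECTOR
    ; with Start = 2 and NumElt = 2. The new path bitcasts %v to <4 x i32>,
    ; extracts i32 element Start/2 = 1, and bitcasts the result to <2 x i16>.
    %sub = shufflevector <8 x i16> %v, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
    ret <2 x i16> %sub
  }
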
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0073929e9c508..cbbb2f6b9f40b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5762,6 +5762,35 @@ SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
SDLoc SL(Op);
+ if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
+ // Insert 32-bit registers at a time.
+ assert(InsNumElts % 2 == 0 && "expect legal vector types");
+
+ unsigned VecNumElts = VecVT.getVectorNumElements();
+ EVT NewVecVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
+ EVT NewInsVT = InsNumElts == 2 ? MVT::i32
+ : EVT::getVectorVT(*DAG.getContext(),
+ MVT::i32, InsNumElts / 2);
+
+ Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
+ Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
+
+ for (unsigned I = 0; I != InsNumElts / 2; ++I) {
+ SDValue Elt;
+ if (InsNumElts == 2) {
+ Elt = Ins;
+ } else {
+ Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
+ DAG.getConstant(I, SL, MVT::i32));
+ }
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
+ DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
+ }
+
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
+ }
+
for (unsigned I = 0; I != InsNumElts; ++I) {
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
DAG.getConstant(I, SL, MVT::i32));
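
The insert side is analogous. A minimal IR sketch, again hypothetical rather than taken from the commit's tests, using the llvm.vector.insert intrinsic to produce an INSERT_SUBVECTOR at an even index:

  declare <8 x i16> @llvm.vector.insert.v8i16.v2i16(<8 x i16>, <2 x i16>, i64)

  define <8 x i16> @insert_mid_v2i16(<8 x i16> %vec, <2 x i16> %ins) {
    ; With IdxVal = 2 and InsNumElts = 2, the new path bitcasts %vec to
    ; <4 x i32> and %ins to i32, performs a single INSERT_VECTOR_ELT at i32
    ; element IdxVal/2 = 1, and bitcasts back, instead of extracting and
    ; reinserting the two i16 elements one at a time.
    %r = call <8 x i16> @llvm.vector.insert.v8i16.v2i16(<8 x i16> %vec, <2 x i16> %ins, i64 2)
    ret <8 x i16> %r
  }
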
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index d04ffa351b797..a83e98ba7773b 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -213,8 +213,6 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: .LBB4_3: ; %if.end
-; GCN-NEXT: s_mov_b32 s4, 0xffff
-; GCN-NEXT: v_bfi_b32 v0, s4, v0, v0
; GCN-NEXT: global_store_short v[0:1], v1, off
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index c7f249985841f..6c564b244012a 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -947,16 +947,7 @@ define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
; GFX9-NEXT: v_lshl_add_u32 v2, v1, 5, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_bfi_b32 v5, s4, v2, v2
-; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v4
-; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v4
-; GFX9-NEXT: v_bfi_b32 v5, s4, v2, v5
-; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v5
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
%idx = shl i32 %idxp, 4
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 0f65920946ab1..baee88b69d060 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -10,13 +10,12 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_pack_lh_b32_b16 s4, s0, s0
; GFX900-NEXT: v_mov_b32_e32 v5, s3
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s0
; GFX900-NEXT: v_mov_b32_e32 v3, s0
-; GFX900-NEXT: v_mov_b32_e32 v0, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, s4
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX900-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX900-NEXT: s_endpgm
@@ -26,13 +25,12 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_pack_lh_b32_b16 s4, s0, s0
; GFX906-NEXT: v_mov_b32_e32 v5, s3
; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
+; GFX906-NEXT: v_mov_b32_e32 v0, s0
; GFX906-NEXT: v_mov_b32_e32 v1, s1
+; GFX906-NEXT: v_mov_b32_e32 v2, s0
; GFX906-NEXT: v_mov_b32_e32 v3, s0
-; GFX906-NEXT: v_mov_b32_e32 v0, s4
-; GFX906-NEXT: v_mov_b32_e32 v2, s4
; GFX906-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX906-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX906-NEXT: s_endpgm
@@ -42,13 +40,12 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_pack_lh_b32_b16 s4, s0, s0
; GFX908-NEXT: v_mov_b32_e32 v5, s3
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, s0
; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v2, s0
; GFX908-NEXT: v_mov_b32_e32 v3, s0
-; GFX908-NEXT: v_mov_b32_e32 v0, s4
-; GFX908-NEXT: v_mov_b32_e32 v2, s4
; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX908-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX908-NEXT: s_endpgm
@@ -58,13 +55,12 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s0, s0
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: v_mov_b32_e32 v3, s0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX90A-NEXT: s_endpgm