[llvm] c8ed362 - [AMDGPU] Cast sub-dword elements to i32 in concat_vectors
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 9 15:35:57 PST 2023
Author: Stanislav Mekhanoshin
Date: 2023-01-09T15:35:49-08:00
New Revision: c8ed36281a929c34477ffc391a8a3e84d1a07a92
URL: https://github.com/llvm/llvm-project/commit/c8ed36281a929c34477ffc391a8a3e84d1a07a92
DIFF: https://github.com/llvm/llvm-project/commit/c8ed36281a929c34477ffc391a8a3e84d1a07a92.diff
LOG: [AMDGPU] Cast sub-dword elements to i32 in concat_vectors
This produces better code by avoiding repacking in some cases.
Fixes: SWDEV-373436
Differential Revision: https://reviews.llvm.org/D141329
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 27ea9c3913a3..8121b381e83f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1344,21 +1344,36 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
+ SDLoc SL(Op);
EVT VT = Op.getValueType();
- if (VT == MVT::v4i16 || VT == MVT::v4f16) {
- SDLoc SL(Op);
- SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
- SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
+ if (VT.getVectorElementType().getSizeInBits() < 32) {
+ unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
+ if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
+ unsigned NewNumElt = OpBitSize / 32;
+ EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
+ : EVT::getVectorVT(*DAG.getContext(),
+ MVT::i32, NewNumElt);
+ for (const SDUse &U : Op->ops()) {
+ SDValue In = U.get();
+ SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
+ if (NewNumElt > 1)
+ DAG.ExtractVectorElements(NewIn, Args);
+ else
+ Args.push_back(NewIn);
+ }
- SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
- return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ NewNumElt * Op.getNumOperands());
+ SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
+ return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+ }
}
for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);
- return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
+ return DAG.getBuildVector(Op.getValueType(), SL, Args);
}
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index d0f11699c9a4..b023066b37d8 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -2273,21 +2273,12 @@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX9-LABEL: shuffle_v16f16_concat:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_bfi_b32 v3, s4, v9, v9
-; GFX9-NEXT: v_bfi_b32 v2, s4, v8, v8
-; GFX9-NEXT: v_bfi_b32 v1, s4, v7, v7
-; GFX9-NEXT: v_bfi_b32 v0, s4, v6, v6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfi_b32 v9, s4, v13, v13
-; GFX9-NEXT: v_bfi_b32 v8, s4, v12, v12
-; GFX9-NEXT: v_bfi_b32 v7, s4, v11, v11
-; GFX9-NEXT: v_bfi_b32 v6, s4, v10, v10
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -2295,20 +2286,12 @@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
+; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_bfi_b32 v3, 0xffff, v9, v9
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_bfi_b32 v13, 0xffff, v13, v13
-; GFX10-NEXT: v_bfi_b32 v12, 0xffff, v12, v12
-; GFX10-NEXT: v_bfi_b32 v11, 0xffff, v11, v11
-; GFX10-NEXT: v_bfi_b32 v10, 0xffff, v10, v10
-; GFX10-NEXT: v_bfi_b32 v2, 0xffff, v8, v8
-; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v7, v7
-; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v6, v6
-; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:16
-; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -2316,21 +2299,12 @@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b128 v[6:9], v[0:1], off
-; GFX11-NEXT: global_load_b128 v[0:3], v[2:3], off
+; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_bfi_b32 v9, 0xffff, v9, v9
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_bfi_b32 v3, 0xffff, v3, v3
-; GFX11-NEXT: v_bfi_b32 v2, 0xffff, v2, v2
-; GFX11-NEXT: v_bfi_b32 v1, 0xffff, v1, v1
-; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
-; GFX11-NEXT: v_bfi_b32 v8, 0xffff, v8, v8
-; GFX11-NEXT: v_bfi_b32 v7, 0xffff, v7, v7
-; GFX11-NEXT: v_bfi_b32 v6, 0xffff, v6, v6
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
-; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val0 = load <8 x half>, ptr addrspace(1) %arg0
More information about the llvm-commits mailing list