[llvm] c8ed362 - [AMDGPU] Cast sub-dword elements to i32 in concat_vectors

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 9 15:35:57 PST 2023


Author: Stanislav Mekhanoshin
Date: 2023-01-09T15:35:49-08:00
New Revision: c8ed36281a929c34477ffc391a8a3e84d1a07a92

URL: https://github.com/llvm/llvm-project/commit/c8ed36281a929c34477ffc391a8a3e84d1a07a92
DIFF: https://github.com/llvm/llvm-project/commit/c8ed36281a929c34477ffc391a8a3e84d1a07a92.diff

LOG: [AMDGPU] Cast sub-dword elements to i32 in concat_vectors

This produces better code by avoiding repacking in some cases.

Fixes: SWDEV-373436

Differential Revision: https://reviews.llvm.org/D141329

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 27ea9c3913a3..8121b381e83f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1344,21 +1344,36 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                   SelectionDAG &DAG) const {
   SmallVector<SDValue, 8> Args;
+  SDLoc SL(Op);
 
   EVT VT = Op.getValueType();
-  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
-    SDLoc SL(Op);
-    SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
-    SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
+  if (VT.getVectorElementType().getSizeInBits() < 32) {
+    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
+    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
+      unsigned NewNumElt = OpBitSize / 32;
+      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
+                                      : EVT::getVectorVT(*DAG.getContext(),
+                                                         MVT::i32, NewNumElt);
+      for (const SDUse &U : Op->ops()) {
+        SDValue In = U.get();
+        SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
+        if (NewNumElt > 1)
+          DAG.ExtractVectorElements(NewIn, Args);
+        else
+          Args.push_back(NewIn);
+      }
 
-    SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
-    return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                   NewNumElt * Op.getNumOperands());
+      SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
+      return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+    }
   }
 
   for (const SDUse &U : Op->ops())
     DAG.ExtractVectorElements(U.get(), Args);
 
-  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
+  return DAG.getBuildVector(Op.getValueType(), SL, Args);
 }
 
 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,

diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index d0f11699c9a4..b023066b37d8 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -2273,21 +2273,12 @@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX9-LABEL: shuffle_v16f16_concat:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_bfi_b32 v3, s4, v9, v9
-; GFX9-NEXT:    v_bfi_b32 v2, s4, v8, v8
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v7, v7
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v6, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_bfi_b32 v9, s4, v13, v13
-; GFX9-NEXT:    v_bfi_b32 v8, s4, v12, v12
-; GFX9-NEXT:    v_bfi_b32 v7, s4, v11, v11
-; GFX9-NEXT:    v_bfi_b32 v6, s4, v10, v10
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2295,20 +2286,12 @@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_bfi_b32 v3, 0xffff, v9, v9
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_bfi_b32 v13, 0xffff, v13, v13
-; GFX10-NEXT:    v_bfi_b32 v12, 0xffff, v12, v12
-; GFX10-NEXT:    v_bfi_b32 v11, 0xffff, v11, v11
-; GFX10-NEXT:    v_bfi_b32 v10, 0xffff, v10, v10
-; GFX10-NEXT:    v_bfi_b32 v2, 0xffff, v8, v8
-; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v7, v7
-; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v6, v6
-; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:16
-; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2316,21 +2299,12 @@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_load_b128 v[6:9], v[0:1], off
-; GFX11-NEXT:    global_load_b128 v[0:3], v[2:3], off
+; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_bfi_b32 v9, 0xffff, v9, v9
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_bfi_b32 v3, 0xffff, v3, v3
-; GFX11-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
-; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, v1, v1
-; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
-; GFX11-NEXT:    v_bfi_b32 v8, 0xffff, v8, v8
-; GFX11-NEXT:    v_bfi_b32 v7, 0xffff, v7, v7
-; GFX11-NEXT:    v_bfi_b32 v6, 0xffff, v6, v6
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
-; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x half>, ptr addrspace(1) %arg0


        


More information about the llvm-commits mailing list