[llvm] AMDGPU: Custom lower bf16 shuffles (PR #122252)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 9 03:03:41 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

We already custom-lower vector shuffles for the other 16-bit element types.

---

The patch is 142.54 KiB and is truncated to 20.00 KiB below; the full version is available at https://github.com/llvm/llvm-project/pull/122252.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+5-5) 
- (modified) llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll (+1696-906) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ac84f4e1f02af..992f7ed99d3bb7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -784,8 +784,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
 
     setOperationAction(ISD::VECTOR_SHUFFLE,
-                       {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
-                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
+                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
+                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
+                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
                        Custom);
 
     for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
@@ -7545,9 +7546,8 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
   SDLoc SL(Op);
   EVT ResultVT = Op.getValueType();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
-
-  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
-  EVT EltVT = PackVT.getVectorElementType();
+  MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
+  MVT PackVT = MVT::getVectorVT(EltVT, 2);
   int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
 
   // vector_shuffle <0,1,6,7> lhs, rhs
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index e408e83da1c298..e7ae9d831424cc 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
 
@@ -31,16 +32,27 @@ define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_234u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_234u:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_mov_b32_e32 v0, v6
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v1, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_234u:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v4
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, v6
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_234u:
 ; GFX10:       ; %bb.0:
@@ -94,13 +106,22 @@ define <4 x half> @shuffle_v4f16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_u3u1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_u3u1:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v0, v2
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_u3u1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    v_mov_b32_e32 v1, v2
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_u3u1:
 ; GFX10:       ; %bb.0:
@@ -151,16 +172,27 @@ define <4 x half> @shuffle_v4f16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_3u6u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, s4, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_3u6u:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v0, s4, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v1, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_3u6u:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v0, s0, v5, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_3u6u:
 ; GFX10:       ; %bb.0:
@@ -189,16 +221,27 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_3uu7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, s4, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_3uu7:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v0, s4, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v1, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_3uu7:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v0, s0, v5, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_3uu7:
 ; GFX10:       ; %bb.0:
@@ -227,16 +270,27 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_35u5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_35u5:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dword v4, v[2:3], off
+; GX900-NEXT:    s_mov_b32 s4, 0x7060302
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GX900-NEXT:    v_mov_b32_e32 v1, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_35u5:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dword v4, v[2:3], off
+; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_perm_b32 v0, v4, v5, s0
+; GFX940-NEXT:    v_mov_b32_e32 v1, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_35u5:
 ; GFX10:       ; %bb.0:
@@ -263,17 +317,29 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_357u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v4, v6, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_357u:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT:    s_mov_b32 s4, 0x7060302
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_357u:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_357u:
 ; GFX10:       ; %bb.0:
@@ -432,13 +498,22 @@ define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_2301:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_2301:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v0, v2
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_2301:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    v_mov_b32_e32 v1, v2
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_2301:
 ; GFX10:       ; %bb.0:
@@ -773,13 +848,22 @@ define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_6745:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_6745:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v0, v2
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_6745:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    v_mov_b32_e32 v1, v2
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_6745:
 ; GFX10:       ; %bb.0:
@@ -833,16 +917,27 @@ define <4 x half> @shuffle_v4f16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_2356:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, v6, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_2356:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v0, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_2356:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_2356:
 ; GFX10:       ; %bb.0:
@@ -871,16 +966,27 @@ define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_5623:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, v6, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_5623:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v0, v6, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v1, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_5623:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v0, v7, v6, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_5623:
 ; GFX10:       ; %bb.0:
@@ -987,17 +1093,29 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_5734:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_5734:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT:    s_mov_b32 s4, 0x7060302
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_5734:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_5734:
 ; GFX10:       ; %bb.0:
@@ -1027,16 +1145,27 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4i16_2356:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, v6, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4i16_2356:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v0, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4i16_2356:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i16_2356:
 ; GFX10:       ; %bb.0:
@@ -1101,15 +1230,25 @@ define <4 x i16> @shuffle_v4i16_0167(ptr addrsp...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/122252


More information about the llvm-commits mailing list