[llvm] [AMDGPU][GlobalISel] Properly handle lane op lowering for larger vector types (PR #132358)

via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 21 02:09:45 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Vikram Hegde (vikramRH)

<details>
<summary>Changes</summary>

Fixes https://github.com/llvm/llvm-project/issues/128650

Also adds few previously existing permlane64 tests which somehow got removed in between..

---

Patch is 138.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132358.diff


6 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+10-2) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+1028) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll (+614-11) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll (+148-21) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll (+168) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+795) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b3a8183beeacf..158cd1bc60f46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5565,6 +5565,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     return false;
 
   LLT PartialResTy = LLT::scalar(SplitSize);
+  bool NeedsBitcast = false;
   if (Ty.isVector()) {
     LLT EltTy = Ty.getElementType();
     unsigned EltSize = EltTy.getSizeInBits();
@@ -5573,8 +5574,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     } else if (EltSize == 16 || EltSize == 32) {
       unsigned NElem = SplitSize / EltSize;
       PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
+    } else {
+      // Handle all other cases via S32/S64 pieces
+      NeedsBitcast = true;
     }
-    // Handle all other cases via S32/S64 pieces;
   }
 
   SmallVector<Register, 4> PartialRes;
@@ -5600,7 +5603,12 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
   }
 
-  B.buildMergeLikeInstr(DstReg, PartialRes);
+  if (NeedsBitcast)
+    B.buildBitcast(DstReg, B.buildMergeLikeInstr(
+                               LLT::scalar(Ty.getSizeInBits()), PartialRes));
+  else
+    B.buildMergeLikeInstr(DstReg, PartialRes);
+
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 076cf09678b57..65d27f97733e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -9430,3 +9430,1031 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
   store <8 x i16> %v, ptr addrspace(1) %out
   ret void
 }
+
+define void @v_permlane16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v2i64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v2i64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v2i64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v6
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v7
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v2i64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v6
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s1, v7
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v2i64:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v6
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v7
+; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v2i64:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v6
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s1, v7
+; GFX12-GISEL-NEXT:    s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %v = call <2 x i64> @llvm.amdgcn.permlane16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+  store <2 x i64> %v, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_permlane16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v3i64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v9
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v3i64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v9
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v3i64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v3i64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v3i64:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT:    s_clause 0x1
+; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v3i64:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX12-GISEL-NEXT:    s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT:    s_clause 0x1
+; GFX12-GISEL-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %v = call <3 x i64> @llvm.amdgcn.permlane16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+  store <3 x i64> %v, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_permlane16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v4f64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v10
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v11
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v9, v9, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v8, v8, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v4f64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v10
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v11
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v8, v8, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v9, v9, s4, s5
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v4f64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v10
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v11
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v9, v9, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v8, v8, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v4f64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v10
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s1, v11
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v8, v8, s0, s1
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v9, v9, s0, s1
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v4f64:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v10
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v11
+; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v9, v9, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v8, v8, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT:    s_clause 0x1
+; GFX12-SDAG-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v4f64:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v10
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s1, v11
+; GFX12-GISEL-NEXT:    s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v8, v8, s0, s1
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v9, v9, s0, s1
+; GFX12-GISEL-NEXT:    s_clause 0x1
+; GFX12-GISEL-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %v = call <4 x double> @llvm.amdgcn.permlane16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+  store <4 x double> %v, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_permlane16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v8f64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v18
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v19
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v17, v17, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v16, v16, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v15, v15, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v14, v14, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v13, v13, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v12, v12, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v11, v11, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v10, v10, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v9, v9, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v8, v8, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[14:17], off offset:48
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off offset:32
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v8f64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v18
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v19
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v8, v8, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v9, v9, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v10, v10, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v11, v11, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v12, v12, s4, s5
+; GFX10-GISEL-NEXT:    v_permlan...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/132358


More information about the llvm-commits mailing list