[llvm] [AMDGPU][GlobalISel] Properly handle lane op lowering for larger vector types (PR #132358)
Vikram Hegde via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 21 02:09:07 PDT 2025
https://github.com/vikramRH created https://github.com/llvm/llvm-project/pull/132358
Fixes https://github.com/llvm/llvm-project/issues/128650
Also adds a few previously existing permlane64 tests which somehow got removed in between.
>From efb9e6a63619cd7879b6b189482b784ea6d31ae5 Mon Sep 17 00:00:00 2001
From: vikhegde <vikram.hegde at amd.com>
Date: Thu, 27 Feb 2025 16:55:17 +0530
Subject: [PATCH] [AMDGPU][GlobalISel] Properly handle lane op legalization for
larger vector types
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 12 +-
.../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 1028 +++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 625 +++++++++-
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 169 ++-
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 168 +++
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 795 +++++++++++++
6 files changed, 2763 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b3a8183beeacf..158cd1bc60f46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5565,6 +5565,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
return false;
LLT PartialResTy = LLT::scalar(SplitSize);
+ bool NeedsBitcast = false;
if (Ty.isVector()) {
LLT EltTy = Ty.getElementType();
unsigned EltSize = EltTy.getSizeInBits();
@@ -5573,8 +5574,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
} else if (EltSize == 16 || EltSize == 32) {
unsigned NElem = SplitSize / EltSize;
PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
+ } else {
+ // Handle all other cases via S32/S64 pieces
+ NeedsBitcast = true;
}
- // Handle all other cases via S32/S64 pieces;
}
SmallVector<Register, 4> PartialRes;
@@ -5600,7 +5603,12 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
}
- B.buildMergeLikeInstr(DstReg, PartialRes);
+ if (NeedsBitcast)
+ B.buildBitcast(DstReg, B.buildMergeLikeInstr(
+ LLT::scalar(Ty.getSizeInBits()), PartialRes));
+ else
+ B.buildMergeLikeInstr(DstReg, PartialRes);
+
MI.eraseFromParent();
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 076cf09678b57..65d27f97733e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -9430,3 +9430,1031 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
store <8 x i16> %v, ptr addrspace(1) %out
ret void
}
+
+define void @v_permlane16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v2i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v2i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v2i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v2i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v2i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v2i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x i64> @llvm.amdgcn.permlane16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <2 x i64> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v3i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9
+; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v3i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v8
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v9
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v3i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v3i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v8
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v9
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v3i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v3i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v8
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v9
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x i64> @llvm.amdgcn.permlane16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x i64> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v4f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v10
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v11
+; GFX10-SDAG-NEXT: v_permlane16_b32 v9, v9, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v4f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v10
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v11
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v9, v9, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v4f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v10
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v11
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v4f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v10
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v11
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v4f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v10
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v11
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v4f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v10
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v11
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <4 x double> @llvm.amdgcn.permlane16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <4 x double> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v8f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v18
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v19
+; GFX10-SDAG-NEXT: v_permlane16_b32 v17, v17, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v16, v16, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v15, v15, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v14, v14, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v13, v13, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v12, v12, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v11, v11, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v10, v10, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v9, v9, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v8f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v18
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v19
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v9, v9, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v10, v10, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v11, v11, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v12, v12, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v13, v13, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v14, v14, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v15, v15, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v16, v16, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v17, v17, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v8f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v18
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v19
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v17, v17, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v16, v16, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v15, v15, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v14, v14, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v13, v13, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v12, v12, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v11, v11, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v10, v10, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x3
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v8f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v18
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v19
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v10, v10, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v11, v11, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v12, v12, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v13, v13, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v14, v14, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v15, v15, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v16, v16, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v17, v17, s0, s1
+; GFX11-GISEL-NEXT: s_clause 0x3
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v8f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v18
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v19
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v17, v17, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v16, v16, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v15, v15, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v14, v14, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v13, v13, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v12, v12, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v11, v11, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v10, v10, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x3
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v8f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v18
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v19
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v10, v10, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v11, v11, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v12, v12, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v13, v13, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v14, v14, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v15, v15, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v16, v16, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v17, v17, s0, s1
+; GFX12-GISEL-NEXT: s_clause 0x3
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <8 x double> @llvm.amdgcn.permlane16.v8f64(<8 x double> %src0, <8 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <8 x double> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v2i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlanex16_v2i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v2i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlanex16_v2i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_v2i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlanex16_v2i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x i64> @llvm.amdgcn.permlanex16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <2 x i64> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v3i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlanex16_v3i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v8
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v9
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v3i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlanex16_v3i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v8
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v9
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_v3i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlanex16_v3i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v8
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v9
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x i64> @llvm.amdgcn.permlanex16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x i64> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v4f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v10
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v11
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v9, v9, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlanex16_v4f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v10
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v11
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v9, v9, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v4f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v10
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v11
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlanex16_v4f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v10
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v11
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_v4f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v10
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v11
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlanex16_v4f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v10
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v11
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <4 x double> @llvm.amdgcn.permlanex16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <4 x double> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v8f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v18
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v19
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v17, v17, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v16, v16, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v15, v15, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v14, v14, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v13, v13, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v12, v12, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v11, v11, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v10, v10, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v9, v9, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlanex16_v8f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v18
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v19
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v9, v9, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v10, v10, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v11, v11, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v12, v12, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v13, v13, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v14, v14, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v15, v15, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v16, v16, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v17, v17, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v8f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v18
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v19
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v17, v17, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v16, v16, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v15, v15, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v14, v14, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v13, v13, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v12, v12, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v11, v11, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v10, v10, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x3
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlanex16_v8f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v18
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v19
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v10, v10, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v11, v11, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v12, v12, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v13, v13, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v14, v14, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v15, v15, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v16, v16, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v17, v17, s0, s1
+; GFX11-GISEL-NEXT: s_clause 0x3
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_v8f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v18
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v19
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v17, v17, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v16, v16, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v15, v15, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v14, v14, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v13, v13, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v12, v12, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v11, v11, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v10, v10, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x3
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlanex16_v8f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v18
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v19
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v10, v10, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v11, v11, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v12, v12, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v13, v13, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v14, v14, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v15, v15, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v16, v16, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v17, v17, s0, s1
+; GFX12-GISEL-NEXT: s_clause 0x3
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <8 x double> @llvm.amdgcn.permlanex16.v8f64(<8 x double> %src0, <8 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <8 x double> %v, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index f23f9595446eb..6698d360aff4c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL %s
declare i32 @llvm.amdgcn.permlane64(i32)
declare i32 @llvm.amdgcn.workitem.id.x()
-define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
+define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) {
; GFX11-LABEL: test_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -17,12 +17,93 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane64(i32 %src0)
+; GFX11-SDAG-LABEL: test_s_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_s_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) {
+; GFX11-SDAG-LABEL: test_s_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
+; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_s_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) {
+; GFX11-SDAG-LABEL: test_s_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
+; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_s_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlane64.f64(double %src0)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) {
; GFX11-LABEL: test_i:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -32,12 +113,115 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane64(i32 99)
+; GFX11-SDAG-LABEL: test_i_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_i_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane64.i32(i32 99)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
+define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: test_i_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_i_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.permlane64.f32(float 1234.5)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: test_i_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_i_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlane64.i64(i64 99)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: test_i_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_i_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlane64.f64(double 1234.5)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 {
; GFX11-LABEL: test_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -47,11 +231,430 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: test_v_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_v_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx)
+ %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %tidx)
store i32 %v, ptr addrspace(1) %out
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX11-GISEL: {{.*}}
-; GFX11-SDAG: {{.*}}
+
+define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 {
+; GFX11-SDAG-LABEL: test_v_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_v_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %v = call float @llvm.amdgcn.permlane64.f32(float %tidx_f32)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_v_i64(ptr addrspace(1) %out, i64 %src0) #1 {
+; GFX11-SDAG-LABEL: test_v_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_v_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %tidx_i64)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 {
+; GFX11-SDAG-LABEL: test_v_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_v_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %v = call double @llvm.amdgcn.permlane64.f64(double %tidx_f64)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_half(ptr addrspace(1) %out, half %src0) {
+; GFX11-SDAG-LABEL: test_half:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_half:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call half @llvm.amdgcn.permlane64.f16(half %src0)
+ store half %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) {
+; GFX11-SDAG-LABEL: test_bfloat:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_bfloat:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0)
+ store bfloat %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_i16(ptr addrspace(1) %out, i16 %src0) {
+; GFX11-SDAG-LABEL: test_i16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_i16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0)
+ store i16 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v2f16(ptr addrspace(1) %out, <2 x half> %src0) {
+; GFX11-SDAG-LABEL: test_v2f16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v2f16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0)
+ store <2 x half> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v2f32(ptr addrspace(1) %out, <2 x float> %src0) {
+; GFX11-SDAG-LABEL: test_v2f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v2f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
+; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x float> @llvm.amdgcn.permlane64.v2f32(<2 x float> %src0)
+ store <2 x float> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) {
+; GFX11-SDAG-LABEL: test_v7i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8
+; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7
+; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6
+; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
+; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v7i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
+; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
+; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
+; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6
+; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7
+; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <7 x i32> @llvm.amdgcn.permlane64.v7i32(<7 x i32> %src0)
+ store <7 x i32> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v8i16(ptr addrspace(1) %out, <8 x i16> %src0) {
+; GFX11-SDAG-LABEL: test_v8i16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
+; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v8i16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
+; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
+; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <8 x i16> @llvm.amdgcn.permlane64.v8i16(<8 x i16> %src0)
+ store <8 x i16> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v2i64(ptr addrspace(1) %out, <2 x i64> %src0) {
+; GFX11-SDAG-LABEL: test_v2i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
+; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v2i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
+; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
+; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x i64> @llvm.amdgcn.permlane64.v2i64(<2 x i64> %src0)
+ store <2 x i64> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v3i64(ptr addrspace(1) %out, <3 x i64> %src0) {
+; GFX11-SDAG-LABEL: test_v3i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7
+; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6
+; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
+; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v3i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
+; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
+; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
+; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6
+; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x i64> @llvm.amdgcn.permlane64.v3i64(<3 x i64> %src0)
+ store <3 x i64> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v4f64(ptr addrspace(1) %out, <4 x double> %src0) {
+; GFX11-SDAG-LABEL: test_v4f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v9, v9
+; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8
+; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7
+; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6
+; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
+; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v4f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
+; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
+; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
+; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6
+; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7
+; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8
+; GFX11-GISEL-NEXT: v_permlane64_b32 v9, v9
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <4 x double> @llvm.amdgcn.permlane64.v4f64(<4 x double> %src0)
+ store <4 x double> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v8f64(ptr addrspace(1) %out, <8 x double> %src0) {
+; GFX11-SDAG-LABEL: test_v8f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v17, v17
+; GFX11-SDAG-NEXT: v_permlane64_b32 v16, v16
+; GFX11-SDAG-NEXT: v_permlane64_b32 v15, v15
+; GFX11-SDAG-NEXT: v_permlane64_b32 v14, v14
+; GFX11-SDAG-NEXT: v_permlane64_b32 v13, v13
+; GFX11-SDAG-NEXT: v_permlane64_b32 v12, v12
+; GFX11-SDAG-NEXT: v_permlane64_b32 v11, v11
+; GFX11-SDAG-NEXT: v_permlane64_b32 v10, v10
+; GFX11-SDAG-NEXT: v_permlane64_b32 v9, v9
+; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8
+; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7
+; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6
+; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
+; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: s_clause 0x3
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v8f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
+; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
+; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
+; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6
+; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7
+; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8
+; GFX11-GISEL-NEXT: v_permlane64_b32 v9, v9
+; GFX11-GISEL-NEXT: v_permlane64_b32 v10, v10
+; GFX11-GISEL-NEXT: v_permlane64_b32 v11, v11
+; GFX11-GISEL-NEXT: v_permlane64_b32 v12, v12
+; GFX11-GISEL-NEXT: v_permlane64_b32 v13, v13
+; GFX11-GISEL-NEXT: v_permlane64_b32 v14, v14
+; GFX11-GISEL-NEXT: v_permlane64_b32 v15, v15
+; GFX11-GISEL-NEXT: v_permlane64_b32 v16, v16
+; GFX11-GISEL-NEXT: v_permlane64_b32 v17, v17
+; GFX11-GISEL-NEXT: s_clause 0x3
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <8 x double> @llvm.amdgcn.permlane64.v8f64(<8 x double> %src0)
+ store <8 x double> %v, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 55fa02a0c582c..a369b33562d6c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -163,30 +163,157 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) {
ret void
}
-; FIXME: Broken
-; define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) {
-; %x = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src)
-; call void asm sideeffect "; use $0", "s"(<2 x i64> %x)
-; ret void
-; }
+define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v2i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v2i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src)
+ call void asm sideeffect "; use $0", "s"(<2 x i64> %x)
+ ret void
+}
-; define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) {
-; %x = call <3 x i64> @llvm.amdgcn.readfirstlane.v3i64(<3 x i64> %src)
-; call void asm sideeffect "; use $0", "s"(<3 x i64> %x)
-; ret void
-; }
+define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v3i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:9]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x i64> @llvm.amdgcn.readfirstlane.v3i64(<3 x i64> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x i64> %x)
+ ret void
+}
-; define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) {
-; %x = call <4 x i64> @llvm.amdgcn.readfirstlane.v4i64(<4 x i64> %src)
-; call void asm sideeffect "; use $0", "s"(<4 x i64> %x)
-; ret void
-; }
+define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v4i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:11]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v4i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:11]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <4 x i64> @llvm.amdgcn.readfirstlane.v4i64(<4 x i64> %src)
+ call void asm sideeffect "; use $0", "s"(<4 x i64> %x)
+ ret void
+}
-; define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) {
-; %x = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> %src)
-; call void asm sideeffect "; use $0", "s"(<8 x i64> %x)
-; ret void
-; }
+define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v8i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v8i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> %src)
+ call void asm sideeffect "; use $0", "s"(<8 x i64> %x)
+ ret void
+}
define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index edb6ebcee1325..8306ef2e78b3c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -894,6 +894,174 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src
ret void
}
+define void @test_readlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v2i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v2i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v6
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s7
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s7
+; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s7
+; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s7
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x i64> @llvm.amdgcn.readlane.v2i64(<2 x i64> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<2 x i64> %x)
+ ret void
+}
+
+define void @test_readlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v3i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v8
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s9
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s9
+; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s9
+; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s9
+; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s9
+; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s9
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:9]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x i64> @llvm.amdgcn.readlane.v3i64(<3 x i64> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<3 x i64> %x)
+ ret void
+}
+
+define void @test_readlane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v4f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v10
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s11, v9, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:11]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v4f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v10
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s11
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s11
+; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s11
+; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s11
+; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s11
+; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s11
+; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s11
+; CHECK-GISEL-NEXT: v_readlane_b32 s11, v9, s11
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:11]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <4 x double> @llvm.amdgcn.readlane.v4f64(<4 x double> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<4 x double> %x)
+ ret void
+}
+
+define void @test_readlane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v8f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v18
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s19, v17, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s18, v16, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s17, v15, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s16, v14, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s15, v13, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s14, v12, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s13, v11, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s12, v10, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s11, v9, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v8f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v18
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s11, v9, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s12, v10, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s13, v11, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s14, v12, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s15, v13, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s16, v14, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s17, v15, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s18, v16, s19
+; CHECK-GISEL-NEXT: v_readlane_b32 s19, v17, s19
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <8 x double> @llvm.amdgcn.readlane.v8f64(<8 x double> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<8 x double> %x)
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 6646818b7b36f..2d80ade688f3b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -2700,6 +2700,801 @@ define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %sr
ret void
}
+define void @test_writelane_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v2i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v6
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v10, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v9, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v8, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s7, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v2i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v9, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s7, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s8, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v2i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b128 v[7:10], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v9, s2, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s3, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s4, s1
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[7:10], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v2i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v7, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v8, s6, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v9, s7, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v10, s8, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v2i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v7, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v8, s6, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v9, s7, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s8, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v2i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b128 v[7:10], v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v6
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v7, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v8, s2, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v9, s3, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s4, s1
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[7:10], off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <2 x i64>, ptr addrspace(1) %out ; value currently at %out, passed below as the third writelane operand
+ %writelane = call <2 x i64> @llvm.amdgcn.writelane.v2i64(<2 x i64> %src, i32 %src1, <2 x i64> %oldval) ; 128-bit vector operand: each check block above expects four v_writelane_b32
+ store <2 x i64> %writelane, ptr addrspace(1) %out, align 4 ; result is stored back to the address %oldval was loaded from
+ ret void
+}
+
+define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_add_u32_e32 v13, vcc, 16, v0
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[9:12], v[0:1]
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dwordx2 v[15:16], v[13:14]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v8
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v2
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v7
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v12, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v11, s7, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v10, s8, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v9, s9, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v15, s5, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[13:14], v[15:16]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: global_load_dwordx2 v[13:14], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[9:12], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v8
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v7
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v6
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s9, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v9, s10, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[9:12], off
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[13:14], off offset:16
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_load_b64 v[13:14], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT: global_load_b128 v[9:12], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v8
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v7
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v6
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s5, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v9, s6, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[9:12], off
+; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[13:14], off offset:16
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v3i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_add_u32_e32 v17, vcc, 16, v0
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[9:12], v[0:1]
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[13:16], v[17:18]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v8
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX802-GISEL-NEXT: v_writelane_b32 v9, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v10, s6, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v11, s7, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v12, s8, m0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v13, s9, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v14, s10, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, v13
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, v14
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[17:18], v[2:3]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v3i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[9:12], v[0:1], off
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[13:16], v[0:1], off offset:16
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v8
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v6
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v7
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v9, s4, s5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s6, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s9, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s10, s5
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, v13
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v3, v14
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[9:12], off
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v3i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: global_load_b128 v[9:12], v[0:1], off
+; GFX1100-GISEL-NEXT: global_load_b128 v[13:16], v[0:1], off offset:16
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v8
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v6
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v7
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v5
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v9, s0, s1
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s2, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s5, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s6, s1
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, v13
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, v14
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[9:12], off
+; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off offset:16
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <3 x i64>, ptr addrspace(1) %out ; value currently at %out, passed below as the third writelane operand
+ %writelane = call <3 x i64> @llvm.amdgcn.writelane.v3i64(<3 x i64> %src, i32 %src1, <3 x i64> %oldval) ; overload suffix matches the <3 x i64> operands; each check block above expects six v_writelane_b32
+ store <3 x i64> %writelane, ptr addrspace(1) %out, align 4 ; result is stored back to the address %oldval was loaded from
+ ret void
+}
+
+define void @test_writelane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v4f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_add_u32_e32 v19, vcc, 16, v0
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[0:1]
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[19:20]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v10
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v2
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v14, s8, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v13, s9, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v12, s10, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v11, s11, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v18, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v17, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v15, s7, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v4f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[15:18], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v10
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v18, s9, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v17, s10, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s11, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s12, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[11:14], off offset:16
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v4f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_load_b128 v[11:14], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT: global_load_b128 v[15:18], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v18, s5, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v17, s6, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s7, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s8, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[15:18], off
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[11:14], off offset:16
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v4f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_add_u32_e32 v19, vcc, 16, v0
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[11:14], v[0:1]
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[15:18], v[19:20]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v10
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v9
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX802-GISEL-NEXT: v_writelane_b32 v11, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v12, s6, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v13, s7, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v14, s8, m0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v15, s9, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v16, s10, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v17, s11, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v18, s12, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v4f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:16
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v10
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s12, v9
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s6, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v15, s9, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v16, s10, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v17, s11, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v18, s12, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off offset:16
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v4f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: global_load_b128 v[11:14], v[0:1], off
+; GFX1100-GISEL-NEXT: global_load_b128 v[15:18], v[0:1], off offset:16
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s8, v9
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s2, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v15, s5, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v16, s6, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v17, s7, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v18, s8, s1
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[11:14], off
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[15:18], off offset:16
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <4 x double>, ptr addrspace(1) %out ; value currently at %out, passed below as the third writelane operand
+ %writelane = call <4 x double> @llvm.amdgcn.writelane.v4f64(<4 x double> %src, i32 %src1, <4 x double> %oldval) ; 256-bit vector operand: each check block above expects eight v_writelane_b32
+ store <4 x double> %writelane, ptr addrspace(1) %out, align 4 ; result is stored back to the address %oldval was loaded from
+ ret void
+}
+
+define void @test_writelane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v8f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v18
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[18:21], v[0:1]
+; GFX802-SDAG-NEXT: v_add_u32_e32 v22, vcc, 16, v0
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[2:5], v[22:23]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v15
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v14
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s12, v13
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s13, v12
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s14, v11
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s15, v10
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v17
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v16
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v21, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v20, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v19, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v18, s7, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[18:21]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; GFX802-SDAG-NEXT: v_add_u32_e32 v18, vcc, 32, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 48, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[18:19]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(3)
+; GFX802-SDAG-NEXT: v_writelane_b32 v5, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v3, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s7, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v9, s8, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v15, s12, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v14, s13, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v13, s14, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v12, s15, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v8, s9, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s10, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s11, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v8f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_clause 0x3
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[23:26], v[0:1], off
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:48
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:32
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v18
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s17, v13
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s18, v12
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s19, v11
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s20, v10
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s13, v17
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s14, v16
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s15, v15
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s16, v14
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(3)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v22, s4, s5
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(2)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v26, s9, s5
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v30, s13, s5
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v34, s17, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v33, s18, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v32, s19, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v31, s20, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v29, s14, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v28, s15, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v27, s16, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v25, s10, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v24, s11, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v23, s12, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v21, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v20, s7, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v19, s8, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[31:34], off offset:32
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[27:30], off offset:48
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[19:22], off offset:16
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v8f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_clause 0x3
+; GFX1100-SDAG-NEXT: global_load_b128 v[19:22], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT: global_load_b128 v[23:26], v[0:1], off
+; GFX1100-SDAG-NEXT: global_load_b128 v[27:30], v[0:1], off offset:48
+; GFX1100-SDAG-NEXT: global_load_b128 v[31:34], v[0:1], off offset:32
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v18
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s13, v13
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s14, v12
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s15, v11
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s16, v10
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s9, v17
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s10, v16
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s11, v15
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s12, v14
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v22, s0, s1
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v26, s5, s1
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v30, s9, s1
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v34, s13, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v33, s14, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v32, s15, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v31, s16, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v29, s10, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v28, s11, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v27, s12, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v25, s6, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v24, s7, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v23, s8, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v21, s2, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v20, s3, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v19, s4, s1
+; GFX1100-SDAG-NEXT: s_clause 0x3
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[31:34], off offset:32
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[27:30], off offset:48
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[23:26], off
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[19:22], off offset:16
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v8f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v18
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[18:21], v[0:1]
+; GFX802-GISEL-NEXT: v_add_u32_e32 v22, vcc, 16, v0
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[2:5], v[22:23]
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v7
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v11
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v12
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v13
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v14
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s13, v15
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s14, v16
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s15, v17
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX802-GISEL-NEXT: v_writelane_b32 v18, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v19, s6, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v20, s7, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v21, s8, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[18:21]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v6
+; GFX802-GISEL-NEXT: v_add_u32_e32 v18, vcc, 32, v0
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 48, v0
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v8
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v9
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v10
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[6:9], v[18:19]
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[10:13], v[0:1]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(3)
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v3, s5, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s6, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v5, s7, m0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX802-GISEL-NEXT: v_writelane_b32 v6, s8, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v7, s9, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v8, s10, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v9, s11, m0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v10, s12, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v11, s13, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v12, s14, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v13, s15, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[18:19], v[6:9]
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v8f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_clause 0x3
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[19:22], v[0:1], off
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:16
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:32
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:48
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v18
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s12, v9
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s13, v10
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s14, v11
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s15, v12
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s16, v13
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s17, v14
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s18, v15
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s19, v16
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s20, v17
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(3)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v19, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v20, s6, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v21, s7, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v22, s8, s5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(2)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v23, s9, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v24, s10, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v25, s11, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v26, s12, s5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v27, s13, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v28, s14, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v29, s15, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v30, s16, s5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v31, s17, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v32, s18, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v33, s19, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v34, s20, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off offset:16
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off offset:32
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off offset:48
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v8f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_clause 0x3
+; GFX1100-GISEL-NEXT: global_load_b128 v[19:22], v[0:1], off
+; GFX1100-GISEL-NEXT: global_load_b128 v[23:26], v[0:1], off offset:16
+; GFX1100-GISEL-NEXT: global_load_b128 v[27:30], v[0:1], off offset:32
+; GFX1100-GISEL-NEXT: global_load_b128 v[31:34], v[0:1], off offset:48
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v18
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s8, v9
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s9, v10
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s10, v11
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s11, v12
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s12, v13
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s13, v14
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s14, v15
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s15, v16
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s16, v17
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(3)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v19, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v20, s2, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v21, s3, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v22, s4, s1
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(2)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v23, s5, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v24, s6, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v25, s7, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v26, s8, s1
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v27, s9, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v28, s10, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v29, s11, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v30, s12, s1
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v31, s13, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v32, s14, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v33, s15, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v34, s16, s1
+; GFX1100-GISEL-NEXT: s_clause 0x3
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[19:22], off
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[23:26], off offset:16
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[27:30], off offset:32
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[31:34], off offset:48
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <8 x double>, ptr addrspace(1) %out
+ %writelane = call <8 x double> @llvm.amdgcn.writelane.v8f64(<8 x double> %src, i32 %src1, <8 x double> %oldval)
+ store <8 x double> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
More information about the llvm-commits
mailing list