[llvm] [AMDGPU] Add type-generic llvm.amdgcn.readfirstlane2 intrinsic (PR #87334)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 2 04:05:13 PDT 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/87334
- **[AMDGPU] Generate readfirstlane checks**
- **[AMDGPU] Add type-generic llvm.amdgcn.readfirstlane2 intrinsic**
From fff34d4d8a6c4e9a0e78e01852096019891fe786 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 2 Apr 2024 11:25:49 +0100
Subject: [PATCH 1/2] [AMDGPU] Generate readfirstlane checks
---
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 100 +++++++++++-------
1 file changed, 64 insertions(+), 36 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 0284f44f5f14d4..baeca3e61791bb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,71 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
-declare i32 @llvm.amdgcn.readfirstlane(i32) #0
-
-; CHECK-LABEL: {{^}}test_readfirstlane:
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
-define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 {
+define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: test_readfirstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_imm:
-; CHECK: s_mov_b32 [[SGPR_VAL:s[0-9]]], 32
-; CHECK-NOT: [[SGPR_VAL]]
-; CHECK: ; use [[SGPR_VAL]]
-define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) {
+; CHECK-LABEL: test_readfirstlane_imm:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s0, 32
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_endpgm
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold:
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
-; CHECK-NOT: [[VVAL]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
-define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) {
+; CHECK-LABEL: test_readfirstlane_imm_fold:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v2, 32
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_m0:
-; CHECK: s_mov_b32 m0, -1
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
-define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
+; CHECK-LABEL: test_readfirstlane_m0:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_mov_b32_e32 v2, m0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]]
-; CHECK: ;;#ASMEND
-; CHECK-NOT: [[SGPR]]
-; CHECK-NOT: readfirstlane
-; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) {
+; CHECK-LABEL: test_readfirstlane_copy_from_sgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: s_mov_b32 s2, 0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; Make sure this doesn't crash.
-; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0
-define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) {
+; CHECK-LABEL: test_readfirstlane_fi:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s4
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_endpgm
%alloca = alloca i32, addrspace(5)
%int = ptrtoint ptr addrspace(5) %alloca to i32
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
}
-
-attributes #0 = { nounwind readnone convergent }
-attributes #1 = { nounwind }
From 627614ed4b6ce5f973a11fff831eb2f1dab8dcfd Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 2 Apr 2024 11:18:40 +0100
Subject: [PATCH 2/2] [AMDGPU] Add type-generic llvm.amdgcn.readfirstlane2 intrinsic
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 7 ++
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 +
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 6 ++
.../Target/AMDGPU/AMDGPUSearchableTables.td | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 21 ++++-
llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +-
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +-
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 77 +++++++++++++++++++
9 files changed, 116 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bda3b066b77636..13ec0954b545ab 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2206,11 +2206,18 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+// i32 llvm.amdgcn.readfirstlane(i32)
def int_amdgcn_readfirstlane :
ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+// ty llvm.amdgcn.readfirstlane2(ty)
+// A type-generic version of readfirstlane.
+def int_amdgcn_readfirstlane2 :
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f283af6fa07d3e..04e8f77d2fa32d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5453,6 +5453,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LDS)
NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
+ NODE_NAME_CASE(READFIRSTLANE)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(LOAD_D16_HI)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index f10a357125e562..4c22f8586ec238 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -541,6 +541,8 @@ enum NodeType : unsigned {
FPTRUNC_ROUND_UPWARD,
FPTRUNC_ROUND_DOWNWARD,
+ READFIRSTLANE,
+
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
LOAD_D16_HI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 82f58ea38fd0a7..560474613adf36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -342,6 +342,8 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", SDTIntUnaryOp>;
+
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
@@ -504,3 +506,7 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc
def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
(AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
+ [(int_amdgcn_readfirstlane node:$src),
+ (AMDGPUreadfirstlane_impl node:$src)]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 410dc83d45c57f..60a2127e4991dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -393,6 +393,7 @@ def UniformIntrinsics : GenericTable {
}
def : AlwaysUniform<int_amdgcn_readfirstlane>;
+def : AlwaysUniform<int_amdgcn_readfirstlane2>;
def : AlwaysUniform<int_amdgcn_readlane>;
def : AlwaysUniform<int_amdgcn_icmp>;
def : AlwaysUniform<int_amdgcn_fcmp>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0a4370de0613b3..a2985582a89de6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -238,7 +238,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
}
-
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
@@ -8452,6 +8451,26 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_addrspacecast_nonnull:
return lowerADDRSPACECAST(Op, DAG);
+ case Intrinsic::amdgcn_readfirstlane2:
+ if (VT.getSizeInBits() <= 32) {
+ MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
+ return DAG.getBitcast(
+ VT, DAG.getAnyExtOrTrunc(
+ DAG.getNode(AMDGPUISD::READFIRSTLANE, DL, MVT::i32,
+ DAG.getAnyExtOrTrunc(
+ DAG.getBitcast(IntVT, Op.getOperand(1)), DL,
+ MVT::i32)),
+ DL, IntVT));
+ }
+ if (VT.getSizeInBits() % 32 == 0) {
+ MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
+ return DAG.getBitcast(
+ VT, DAG.UnrollVectorOp(
+ DAG.getNode(AMDGPUISD::READFIRSTLANE, DL, VecVT,
+ DAG.getBitcast(VecVT, Op.getOperand(1)))
+ .getNode()));
+ }
+ return SDValue();
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1c942dcefdacea..8ec74354fafa68 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3405,7 +3405,7 @@ def : GCNPat<
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
- (int_amdgcn_readfirstlane (i32 imm:$src)),
+ (AMDGPUreadfirstlane (i32 imm:$src)),
(S_MOV_B32 SReg_32:$src)
>;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 2341e0d9d32bb4..809cdde14bbe39 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -243,7 +243,7 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
// FIXME: Specify SchedRW for READFIRSTLANE_B32
// TODO: There is VOP3 encoding also
def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
- getVOP1Pat<int_amdgcn_readfirstlane,
+ getVOP1Pat<AMDGPUreadfirstlane,
VOP_READFIRSTLANE>.ret, 1> {
let isConvergent = 1;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index baeca3e61791bb..10b275cb7745bf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -97,3 +97,80 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) {
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
}
+
+define void @test_readfirstlane2_i32(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: test_readfirstlane2_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s4
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %x = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %src)
+ call void asm sideeffect "; use $0", "s"(i32 %x)
+ ret void
+}
+
+define void @test_readfirstlane2_i64(ptr addrspace(1) %out, i64 %src) {
+; CHECK-LABEL: test_readfirstlane2_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %x = call i64 @llvm.amdgcn.readfirstlane2.i64(i64 %src)
+ call void asm sideeffect "; use $0", "s"(i64 %x)
+ ret void
+}
+
+define void @test_readfirstlane2_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
+; CHECK-LABEL: test_readfirstlane2_v7i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[4:10]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %x = call <7 x i32> @llvm.amdgcn.readfirstlane2.v7i32(<7 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<7 x i32> %x)
+ ret void
+}
+
+define void @test_readfirstlane2_f16(ptr addrspace(1) %out, half %src) {
+; CHECK-LABEL: test_readfirstlane2_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s4
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %x = call half @llvm.amdgcn.readfirstlane2.f16(half %src)
+ call void asm sideeffect "; use $0", "s"(half %x)
+ ret void
+}
+
+define void @test_readfirstlane2_float(ptr addrspace(1) %out, float %src) {
+; CHECK-LABEL: test_readfirstlane2_float:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s4
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %x = call float @llvm.amdgcn.readfirstlane2.f32(float %src)
+ call void asm sideeffect "; use $0", "s"(float %x)
+ ret void
+}
More information about the llvm-commits mailing list