[llvm] [AMDGPU] Add intrinsic readanylane (PR #115696)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 11 00:52:22 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Xin "Russell" Liu (GinShio)
Sometimes we know a value is uniform, but the backend cannot easily prove that it is.
This change introduces the intrinsic `readanylane`, which is similar to readfirstlane but has a couple of advantages:
+ It is not convergent, so it can be moved across control flow.
+ If the result is needed in a VGPR, the v_readfirstlane instruction can be optimized away.
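To illustrate the intended usage, here is a minimal IR sketch (the function and value names are hypothetical): the caller guarantees `%src` is uniform across the wave, even though the backend cannot prove it:

```llvm
define i32 @rebroadcast_uniform(i32 %src) {
  ; Not convergent, so the call may be freely sunk or hoisted across
  ; control flow. The result is undefined if %src is actually divergent.
  %u = call i32 @llvm.amdgcn.readanylane.i32(i32 %src)
  ret i32 %u
}
```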
CC: @arsenm, @jayfoad, @nhaehnle, @ruiling
---
Patch is 38.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115696.diff
14 Files Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+7-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp (+2-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+1)
- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+14-11)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+2)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+3-1)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+9)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll (+492)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ptr.ll (+126)
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d6375ab77cfb32..bb7931d4a95c92 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2152,6 +2152,12 @@ def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+// This is similar to readfirstlane, but marks the value as uniform, allowing it to be
+// sunk or hoisted across control flow. The result is undefined if the value is actually divergent.
+def int_amdgcn_readanylane :
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
+ [IntrNoCallback, IntrNoFree, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e3a330d45aaa57..edd8e042d3f4b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2775,6 +2775,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_strict_wqm:
Opcode = AMDGPU::STRICT_WQM;
break;
+ case Intrinsic::amdgcn_readanylane:
+ Opcode = AMDGPU::SI_READANYLANE;
+ break;
case Intrinsic::amdgcn_interp_p1_f16:
SelectInterpP1F16(N);
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 8beb9defee66a0..3bdc258f180f88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1081,6 +1081,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_readlane: {
// If the first argument is uniform these intrinsics return it unchanged.
const Use &Src = II.getArgOperandUse(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d51d136ba4200c..a5e984bde0e6c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>
@@ -97,9 +98,11 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
unsigned NewOpc) const {
+ const bool NeedExec = NewOpc != AMDGPU::SI_READANYLANE;
MI.setDesc(TII.get(NewOpc));
MI.removeOperand(1); // Remove intrinsic ID.
- MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ if (NeedExec)
+ MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
MachineOperand &Dst = MI.getOperand(0);
MachineOperand &Src = MI.getOperand(1);
@@ -112,7 +115,7 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
= TRI.getConstrainedRegClassForOperand(Dst, *MRI);
const TargetRegisterClass *SrcRC
= TRI.getConstrainedRegClassForOperand(Src, *MRI);
- if (!DstRC || DstRC != SrcRC)
+ if (!DstRC || (NeedExec && DstRC != SrcRC))
return false;
return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
@@ -1061,6 +1064,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
case Intrinsic::amdgcn_writelane:
return selectWritelane(I);
+ case Intrinsic::amdgcn_readanylane:
+ return constrainCopyLikeIntrin(I, AMDGPU::SI_READANYLANE);
case Intrinsic::amdgcn_div_scale:
return selectDivScale(I);
case Intrinsic::amdgcn_icmp:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 545eb9046ff030..5ff64e3be58669 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5475,6 +5475,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
@@ -7561,6 +7562,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 6a79aa0cbf4df7..4972ccbce3618e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -137,7 +137,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
Opcode == AMDGPU::SI_TCRETURN_GFX) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
- }
+ } else if (Opcode == AMDGPU::SI_READANYLANE)
+ Opcode = AMDGPU::V_READFIRSTLANE_B32;
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 415c068367074f..1728876eafffcc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4658,6 +4658,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
[[fallthrough]];
}
+ case Intrinsic::amdgcn_readanylane:
+ [[fallthrough]];
case Intrinsic::amdgcn_readfirstlane: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 60fa2adc62dc8c..a36c38e105ce6e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -366,6 +366,7 @@ def UniformIntrinsics : GenericTable {
}
def : AlwaysUniform<int_amdgcn_readfirstlane>;
+def : AlwaysUniform<int_amdgcn_readanylane>;
def : AlwaysUniform<int_amdgcn_readlane>;
def : AlwaysUniform<int_amdgcn_icmp>;
def : AlwaysUniform<int_amdgcn_fcmp>;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 73834773f66e3c..c1f35e62e633fd 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1116,18 +1116,20 @@ void SIFoldOperandsImpl::foldOperand(
unsigned UseOpc = UseMI->getOpcode();
if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
+ UseOpc == AMDGPU::SI_READANYLANE ||
(UseOpc == AMDGPU::V_READLANE_B32 &&
(int)UseOpIdx ==
- AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
+ AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
+ // readanylane does not depend on exec
+ const bool ReadAnyLane = UseOpc == AMDGPU::SI_READANYLANE;
// %vgpr = V_MOV_B32 imm
// %sgpr = V_READFIRSTLANE_B32 %vgpr
// =>
// %sgpr = S_MOV_B32 imm
if (FoldingImmLike) {
- if (execMayBeModifiedBeforeUse(*MRI,
- UseMI->getOperand(UseOpIdx).getReg(),
- *OpToFold.getParent(),
- *UseMI))
+ if (!ReadAnyLane && execMayBeModifiedBeforeUse(
+ *MRI, UseMI->getOperand(UseOpIdx).getReg(),
+ *OpToFold.getParent(), *UseMI))
return;
UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
@@ -1136,15 +1138,15 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
else
UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
- UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
+ if (!ReadAnyLane)
+ UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
return;
}
if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
- if (execMayBeModifiedBeforeUse(*MRI,
- UseMI->getOperand(UseOpIdx).getReg(),
- *OpToFold.getParent(),
- *UseMI))
+ if (!ReadAnyLane && execMayBeModifiedBeforeUse(
+ *MRI, UseMI->getOperand(UseOpIdx).getReg(),
+ *OpToFold.getParent(), *UseMI))
return;
// %vgpr = COPY %sgpr0
@@ -1155,7 +1157,8 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(1).setReg(OpToFold.getReg());
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
UseMI->getOperand(1).setIsKill(false);
- UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
+ if (!ReadAnyLane)
+ UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
return;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 37dc433d154f64..0acc90faa268db 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6186,6 +6186,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Operands.push_back(Src1);
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_permlane64:
Operands.push_back(Src0);
break;
@@ -8837,6 +8838,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return lowerADDRSPACECAST(Op, DAG);
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ad45af00f2bd75..ce5f19b2561dbe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4159,7 +4159,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
- Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
+ Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR ||
+ Opcode == AMDGPU::SI_READANYLANE)
return true;
return false;
@@ -9619,6 +9620,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
unsigned opcode = MI.getOpcode();
if (opcode == AMDGPU::V_READLANE_B32 ||
opcode == AMDGPU::V_READFIRSTLANE_B32 ||
+ opcode == AMDGPU::SI_READANYLANE ||
opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
return InstructionUniformity::AlwaysUniform;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 25df5dabdc6aa1..575fac67288e01 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -546,6 +546,10 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
let maybeAtomic = 0;
}
+def SI_READANYLANE : SPseudoInstSI <(outs SReg_32:$dst), (ins VGPR_32:$src)> {
+ let SALU = 1;
+}
+
// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
@@ -3504,6 +3508,11 @@ def : GCNPat<
(S_MOV_B32 SReg_32:$src)
>;
+def : GCNPat<
+ (i32 (int_amdgcn_readanylane (i32 imm:$src))),
+ (S_MOV_B32 SReg_32:$src)
+>;
+
multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
def : GCNPat <
(vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
new file mode 100644
index 00000000000000..0da1f47d8fe1f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
@@ -0,0 +1,492 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -global-isel -o - < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
+
+define void @test_readanylane_i1(ptr addrspace(1) %out, i1 %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i1:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT: s_and_b32 s0, s0, 1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: global_store_b8 v[0:1], v2, off
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i1:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT: s_and_b32 s0, s0, 1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: global_store_b8 v[0:1], v2, off
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readanylane = call i1 @llvm.amdgcn.readanylane.i1(i1 %src)
+ store i1 %readanylane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_readanylane_i1_inreg(ptr addrspace(1) %out, i1 inreg %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i1_inreg:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: s_and_b32 s0, s0, 1
+; CHECK-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: global_store_b8 v[0:1], v2, off
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i1_inreg:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: s_and_b32 s0, s0, 1
+; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: global_store_b8 v[0:1], v2, off
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readanylane = call i1 @llvm.amdgcn.readanylane.i1(i1 %src)
+ store i1 %readanylane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_readanylane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i1_select:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 42, v2
+; CHECK-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; CHECK-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; CHECK-SDAG-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; CHECK-SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; CHECK-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i1_select:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 42, v2
+; CHECK-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; CHECK-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s0, v4
+; CHECK-GISEL-NEXT: s_and_b32 s0, 1, s0
+; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; CHECK-GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; CHECK-GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %src, 42
+ %readanylane = call i1 @llvm.amdgcn.readanylane.i1(i1 %cmp)
+ %sel = select i1 %readanylane, i32 %src, i32 %src1
+ store i32 %sel, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_readanylane_i16(i16 %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-SDAG-NEXT: s_and_b32 s0, s0, 0xffff
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readanylane = call i16 @llvm.amdgcn.readanylane.i16(i16 %src)
+ call void asm sideeffect "; use $0", "s"(i16 %readanylane)
+ ret void
+}
+
+define void @test_readanylane_half(half %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_half:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_half:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readanylane = call half @llvm.amdgcn.readanylane.f16(half %src)
+ call void asm sideeffect "; use $0", "s"(half %readanylane)
+ ret void
+}
+
+define void @test_readanylane_float(float %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_float:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_float:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readanylane = call float @llvm.amdgcn.readanylane.f32(float %src)
+ call void asm sideeffect "; use $0", "s"(float %readanylane)
+ ret void
+}
+
+define void @test_readanylane_i32_immed() #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i32_immed:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: s_mov_b32 s0, 42
+; CHECK-SDAG-NEXT: ;;#AS...
[truncated]
``````````
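For reference, a minimal MIR sketch of the immediate fold that SIFoldOperands now also performs for the new pseudo (virtual register numbers are hypothetical). Because SI_READANYLANE carries no implicit exec operand, both the `execMayBeModifiedBeforeUse` check and the exec-operand removal are skipped:

```
; Before folding:
%0:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
%1:sreg_32 = SI_READANYLANE %0
; After folding (no trailing exec operand to remove):
%1:sreg_32 = S_MOV_B32 42
```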
https://github.com/llvm/llvm-project/pull/115696