[llvm] [AMDGPU] Add intrinsic readanylane (PR #115696)

Xin Russell Liu via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 14 02:40:34 PST 2024


https://github.com/GinShio updated https://github.com/llvm/llvm-project/pull/115696

>From ca51a8a7591dd8d9053348857925e4f747194cab Mon Sep 17 00:00:00 2001
From: Russell Liu <Xin.Liu2 at amd.com>
Date: Mon, 4 Nov 2024 14:25:33 +0800
Subject: [PATCH 1/4] [AMDGPU] Add intrinsic readanylane

Sometimes we know that a value is uniform, but the backend cannot easily
prove that it is uniform.

This change introduces the intrinsic `readanylane`, which is similar to
readfirstlane, but has a couple of advantages:
 + It is not convergent, so it can be moved across control flow.
 + If the result is needed in a vgpr then the v_readfirstlane
   instruction can be optimized away.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   6 +
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |   3 +
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |   1 +
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |   9 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   2 +
 llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp  |   3 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   2 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   1 +
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     |  25 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   2 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   4 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   9 +
 .../CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll | 492 ++++++++++++++++++
 .../AMDGPU/llvm.amdgcn.readanylane.ptr.ll     | 126 +++++
 14 files changed, 670 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ptr.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d6375ab77cfb32..bb7931d4a95c92 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2152,6 +2152,12 @@ def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+// This is similar to readfirstlane, but marks a value as uniform, which allows it to be sunk
+// or hoisted across control flow. The result is undefined if the value is actually divergent.
+def int_amdgcn_readanylane :
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
+            [IntrNoCallback, IntrNoFree, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
 // The lane argument must be uniform across the currently active threads of the
 // current wave. Otherwise, the result is undefined.
 def int_amdgcn_readlane :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e3a330d45aaa57..edd8e042d3f4b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2775,6 +2775,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
   case Intrinsic::amdgcn_strict_wqm:
     Opcode = AMDGPU::STRICT_WQM;
     break;
+  case Intrinsic::amdgcn_readanylane:
+    Opcode = AMDGPU::SI_READANYLANE;
+    break;
   case Intrinsic::amdgcn_interp_p1_f16:
     SelectInterpP1F16(N);
     return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 8beb9defee66a0..3bdc258f180f88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1081,6 +1081,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   }
   case Intrinsic::amdgcn_permlane64:
   case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readanylane:
   case Intrinsic::amdgcn_readlane: {
     // If the first argument is uniform these intrinsics return it unchanged.
     const Use &Src = II.getArgOperandUse(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d51d136ba4200c..a5e984bde0e6c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -25,6 +25,7 @@
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include <optional>
@@ -97,9 +98,11 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
 
 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                         unsigned NewOpc) const {
+  const bool NeedExec = NewOpc != AMDGPU::SI_READANYLANE;
   MI.setDesc(TII.get(NewOpc));
   MI.removeOperand(1); // Remove intrinsic ID.
-  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+  if (NeedExec)
+    MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
 
   MachineOperand &Dst = MI.getOperand(0);
   MachineOperand &Src = MI.getOperand(1);
@@ -112,7 +115,7 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
   const TargetRegisterClass *SrcRC
     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
-  if (!DstRC || DstRC != SrcRC)
+  if (!DstRC || (NeedExec && DstRC != SrcRC))
     return false;
 
   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
@@ -1061,6 +1064,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
     return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
   case Intrinsic::amdgcn_writelane:
     return selectWritelane(I);
+  case Intrinsic::amdgcn_readanylane:
+    return constrainCopyLikeIntrin(I, AMDGPU::SI_READANYLANE);
   case Intrinsic::amdgcn_div_scale:
     return selectDivScale(I);
   case Intrinsic::amdgcn_icmp:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 545eb9046ff030..5ff64e3be58669 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5475,6 +5475,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
     switch (IID) {
     case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readanylane:
     case Intrinsic::amdgcn_permlane64:
       return LaneOp.getReg(0);
     case Intrinsic::amdgcn_readlane:
@@ -7561,6 +7562,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_readlane:
   case Intrinsic::amdgcn_writelane:
   case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readanylane:
   case Intrinsic::amdgcn_permlane16:
   case Intrinsic::amdgcn_permlanex16:
   case Intrinsic::amdgcn_permlane64:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 6a79aa0cbf4df7..4972ccbce3618e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -137,7 +137,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
              Opcode == AMDGPU::SI_TCRETURN_GFX) {
     // TODO: How to use branch immediate and avoid register+add?
     Opcode = AMDGPU::S_SETPC_B64;
-  }
+  } else if (Opcode == AMDGPU::SI_READANYLANE)
+    Opcode = AMDGPU::V_READFIRSTLANE_B32;
 
   int MCOpcode = TII->pseudoToMCOpcode(Opcode);
   if (MCOpcode == -1) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 415c068367074f..1728876eafffcc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4658,6 +4658,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
       [[fallthrough]];
     }
+    case Intrinsic::amdgcn_readanylane:
+      [[fallthrough]];
     case Intrinsic::amdgcn_readfirstlane: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 60fa2adc62dc8c..a36c38e105ce6e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -366,6 +366,7 @@ def UniformIntrinsics : GenericTable {
 }
 
 def : AlwaysUniform<int_amdgcn_readfirstlane>;
+def : AlwaysUniform<int_amdgcn_readanylane>;
 def : AlwaysUniform<int_amdgcn_readlane>;
 def : AlwaysUniform<int_amdgcn_icmp>;
 def : AlwaysUniform<int_amdgcn_fcmp>;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 73834773f66e3c..c1f35e62e633fd 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1116,18 +1116,20 @@ void SIFoldOperandsImpl::foldOperand(
 
     unsigned UseOpc = UseMI->getOpcode();
     if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
+        UseOpc == AMDGPU::SI_READANYLANE ||
         (UseOpc == AMDGPU::V_READLANE_B32 &&
          (int)UseOpIdx ==
-         AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
+             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
+      // readanylane doesn't care about exec
+      const bool ReadAnyLean = UseOpc == AMDGPU::SI_READANYLANE;
       // %vgpr = V_MOV_B32 imm
       // %sgpr = V_READFIRSTLANE_B32 %vgpr
       // =>
       // %sgpr = S_MOV_B32 imm
       if (FoldingImmLike) {
-        if (execMayBeModifiedBeforeUse(*MRI,
-                                       UseMI->getOperand(UseOpIdx).getReg(),
-                                       *OpToFold.getParent(),
-                                       *UseMI))
+        if (!ReadAnyLean && execMayBeModifiedBeforeUse(
+                                *MRI, UseMI->getOperand(UseOpIdx).getReg(),
+                                *OpToFold.getParent(), *UseMI))
           return;
 
         UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
@@ -1136,15 +1138,15 @@ void SIFoldOperandsImpl::foldOperand(
           UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
         else
           UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
-        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
+        if (!ReadAnyLean)
+          UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
         return;
       }
 
       if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
-        if (execMayBeModifiedBeforeUse(*MRI,
-                                       UseMI->getOperand(UseOpIdx).getReg(),
-                                       *OpToFold.getParent(),
-                                       *UseMI))
+        if (!ReadAnyLean && execMayBeModifiedBeforeUse(
+                                *MRI, UseMI->getOperand(UseOpIdx).getReg(),
+                                *OpToFold.getParent(), *UseMI))
           return;
 
         // %vgpr = COPY %sgpr0
@@ -1155,7 +1157,8 @@ void SIFoldOperandsImpl::foldOperand(
         UseMI->getOperand(1).setReg(OpToFold.getReg());
         UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
         UseMI->getOperand(1).setIsKill(false);
-        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
+        if (!ReadAnyLean)
+          UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
         return;
       }
     }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 37dc433d154f64..0acc90faa268db 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6186,6 +6186,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
       Operands.push_back(Src1);
       [[fallthrough]];
     case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readanylane:
     case Intrinsic::amdgcn_permlane64:
       Operands.push_back(Src0);
       break;
@@ -8837,6 +8838,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return lowerADDRSPACECAST(Op, DAG);
   case Intrinsic::amdgcn_readlane:
   case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readanylane:
   case Intrinsic::amdgcn_writelane:
   case Intrinsic::amdgcn_permlane16:
   case Intrinsic::amdgcn_permlanex16:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ad45af00f2bd75..ce5f19b2561dbe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4159,7 +4159,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
   if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
       Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
       Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
-      Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
+      Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR ||
+      Opcode == AMDGPU::SI_READANYLANE)
     return true;
 
   return false;
@@ -9619,6 +9620,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
   unsigned opcode = MI.getOpcode();
   if (opcode == AMDGPU::V_READLANE_B32 ||
       opcode == AMDGPU::V_READFIRSTLANE_B32 ||
+      opcode == AMDGPU::SI_READANYLANE ||
       opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
     return InstructionUniformity::AlwaysUniform;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 25df5dabdc6aa1..575fac67288e01 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -546,6 +546,10 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
   let maybeAtomic = 0;
 }
 
+def SI_READANYLANE : SPseudoInstSI <(outs SReg_32:$dst), (ins VGPR_32:$src)> {
+  let SALU = 1;
+}
+
 // Used as an isel pseudo to directly emit initialization with an
 // s_mov_b32 rather than a copy of another initialized
 // register. MachineCSE skips copies, and we don't want to have to
@@ -3504,6 +3508,11 @@ def : GCNPat<
   (S_MOV_B32 SReg_32:$src)
 >;
 
+def : GCNPat<
+  (i32 (int_amdgcn_readanylane (i32 imm:$src))),
+  (S_MOV_B32 SReg_32:$src)
+>;
+
 multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
   def : GCNPat <
     (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
new file mode 100644
index 00000000000000..0da1f47d8fe1f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
@@ -0,0 +1,492 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -global-isel -o - < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
+
+define void @test_readanylane_i1(ptr addrspace(1) %out, i1 %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i1:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    global_store_b8 v[0:1], v2, off
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i1:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    global_store_b8 v[0:1], v2, off
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call i1 @llvm.amdgcn.readanylane.i1(i1 %src)
+  store i1 %readanylane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_readanylane_i1_inreg(ptr addrspace(1) %out, i1 inreg %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i1_inreg:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    global_store_b8 v[0:1], v2, off
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i1_inreg:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    global_store_b8 v[0:1], v2, off
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call i1 @llvm.amdgcn.readanylane.i1(i1 %src)
+  store i1 %readanylane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_readanylane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i1_select:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 42, v2
+; CHECK-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
+; CHECK-SDAG-NEXT:    s_bitcmp1_b32 s0, 0
+; CHECK-SDAG-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; CHECK-SDAG-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; CHECK-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i1_select:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 42, v2
+; CHECK-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v4
+; CHECK-GISEL-NEXT:    s_and_b32 s0, 1, s0
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; CHECK-GISEL-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; CHECK-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %src, 42
+  %readanylane = call i1 @llvm.amdgcn.readanylane.i1(i1 %cmp)
+  %sel = select i1 %readanylane, i32 %src, i32 %src1
+  store i32 %sel, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_readanylane_i16(i16 %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i16:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-SDAG-NEXT:    s_and_b32 s0, s0, 0xffff
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i16:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call i16 @llvm.amdgcn.readanylane.i16(i16 %src)
+  call void asm sideeffect "; use $0", "s"(i16 %readanylane)
+  ret void
+}
+
+define void @test_readanylane_half(half %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_half:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_half:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call half @llvm.amdgcn.readanylane.f16(half %src)
+  call void asm sideeffect "; use $0", "s"(half %readanylane)
+  ret void
+}
+
+define void @test_readanylane_float(float %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_float:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_float:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call float @llvm.amdgcn.readanylane.f32(float %src)
+  call void asm sideeffect "; use $0", "s"(float %readanylane)
+  ret void
+}
+
+define void @test_readanylane_i32_immed() #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i32_immed:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    s_mov_b32 s0, 42
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i32_immed:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    s_mov_b32 s0, 42
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call i32 @llvm.amdgcn.readanylane.i32(i32 42)
+  call void asm sideeffect "; use $0", "s"(i32 %readanylane)
+  ret void
+}
+
+define void @test_readanylane_i32_inreg(i32 inreg %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i32_inreg:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i32_inreg:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call i32 @llvm.amdgcn.readanylane.i32(i32 %src)
+  call void asm sideeffect "; use $0", "s"(i32 %readanylane)
+  ret void
+}
+
+define void @test_readanylane_i64(i64 %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call i64 @llvm.amdgcn.readanylane.i64(i64 %src)
+  call void asm sideeffect "; use $0", "s"(i64 %readanylane)
+  ret void
+}
+
+define void @test_readanylane_f64_immed() #1 {
+; CHECK-SDAG-LABEL: test_readanylane_f64_immed:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    s_mov_b64 s[0:1], 1.0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_f64_immed:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    s_mov_b64 s[0:1], 1.0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call double @llvm.amdgcn.readanylane.f64(double 1.0)
+  call void asm sideeffect "; use $0", "s"(double %readanylane)
+  ret void
+}
+
+define void @test_readanylane_m0() #1 {
+; CHECK-SDAG-LABEL: test_readanylane_m0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use m0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_m0:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use m0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
+  %readanylane = call i32 @llvm.amdgcn.readanylane.i32(i32 %m0)
+  call void asm sideeffect "; use $0", "s"(i32 %readanylane)
+  ret void
+}
+
+define void @test_readanylane_v7i32(<7 x i32> %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_v7i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s3, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:6]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_v7i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s2, v2
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s3, v3
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v4
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v5
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s6, v6
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[0:6]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call <7 x i32> @llvm.amdgcn.readanylane.v7i32(<7 x i32> %src)
+  call void asm sideeffect "; use $0", "s"(<7 x i32> %readanylane)
+  ret void
+}
+
+define void @test_readanylane_v8f16(<8 x half> %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_v8f16:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s3, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:3]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_v8f16:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s2, v2
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s3, v3
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[0:3]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call <8 x half> @llvm.amdgcn.readanylane.v8f16(<8 x half> %src)
+  call void asm sideeffect "; use $0", "s"(<8 x half> %readanylane)
+  ret void
+}
+
+define amdgpu_kernel void @test_readanylane_alloc() #1 {
+; CHECK-SDAG-LABEL: test_readanylane_alloc:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_mov_b32 s0, 0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readanylane_alloc:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_mov_b32 s0, 0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
+  %alloca = alloca i32, addrspace(5)
+  %intptr = ptrtoint ptr addrspace(5) %alloca to i32
+  %readanylane = call i32 @llvm.amdgcn.readanylane.i32(i32 %intptr)
+  call void asm sideeffect "; use $0", "s"(i32 %readanylane)
+  ret void
+}
+
+define void @test_readanylane_hoist(i1 %cond, i32 %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_hoist:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-SDAG-NEXT:    ; implicit-def: $sgpr0
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; CHECK-SDAG-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; CHECK-SDAG-NEXT:    s_and_saveexec_b32 s2, s1
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT:    s_xor_b32 s1, exec_lo, s2
+; CHECK-SDAG-NEXT:    s_cbranch_execz .LBB14_2
+; CHECK-SDAG-NEXT:  ; %bb.1: ; %.else
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
+; CHECK-SDAG-NEXT:    ; implicit-def: $vgpr1
+; CHECK-SDAG-NEXT:  .LBB14_2: ; %Flow
+; CHECK-SDAG-NEXT:    s_or_saveexec_b32 s1, s1
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s1
+; CHECK-SDAG-NEXT:    s_cbranch_execz .LBB14_4
+; CHECK-SDAG-NEXT:  ; %bb.3: ; %.then
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:  .LBB14_4: ; %.endif
+; CHECK-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use v0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_hoist:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-GISEL-NEXT:    ; implicit-def: $sgpr0
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-GISEL-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; CHECK-GISEL-NEXT:    s_and_saveexec_b32 s2, s1
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT:    s_xor_b32 s1, exec_lo, s2
+; CHECK-GISEL-NEXT:    s_cbranch_execz .LBB14_2
+; CHECK-GISEL-NEXT:  ; %bb.1: ; %.else
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v1
+; CHECK-GISEL-NEXT:    ; implicit-def: $vgpr1
+; CHECK-GISEL-NEXT:  .LBB14_2: ; %Flow
+; CHECK-GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; CHECK-GISEL-NEXT:    s_cbranch_execz .LBB14_4
+; CHECK-GISEL-NEXT:  ; %bb.3: ; %.then
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v1
+; CHECK-GISEL-NEXT:  .LBB14_4: ; %.endif
+; CHECK-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  br i1 %cond, label %.then, label %.else
+.then:
+  %uni.then = call i32 @llvm.amdgcn.readanylane.i32(i32 %src)
+  br label %.endif
+.else:
+  %uni.else = call i32 @llvm.amdgcn.readanylane.i32(i32 %src)
+  br label %.endif
+.endif:
+  %readanylane = phi i32 [ %uni.then, %.then ], [ %uni.else, %.else ]
+  call void asm sideeffect "; use $0", "s"(i32 %readanylane)
+  ret void
+}
+
+define void @test_readanylane_suck(i1 %cond, i32 %src) #1 {
+; CHECK-SDAG-LABEL: test_readanylane_suck:
+; CHECK-SDAG:       ; %bb.0: ; %.entry
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; CHECK-SDAG-NEXT:    s_and_saveexec_b32 s0, s1
+; CHECK-SDAG-NEXT:    s_cbranch_execz .LBB15_2
+; CHECK-SDAG-NEXT:  ; %bb.1: ; %.else
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT:    s_add_i32 s1, s1, 42
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-SDAG-NEXT:  .LBB15_2: ; %.endif
+; CHECK-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use v0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readanylane_suck:
+; CHECK-GISEL:       ; %bb.0: ; %.entry
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-GISEL-NEXT:    s_mov_b32 s0, 0
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-GISEL-NEXT:    s_xor_b32 s2, vcc_lo, -1
+; CHECK-GISEL-NEXT:    s_and_saveexec_b32 s1, s2
+; CHECK-GISEL-NEXT:    s_cbranch_execz .LBB15_2
+; CHECK-GISEL-NEXT:  ; %bb.1: ; %.else
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v1
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-GISEL-NEXT:    s_add_i32 s0, s0, 42
+; CHECK-GISEL-NEXT:  .LBB15_2: ; %.endif
+; CHECK-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+.entry:
+  %uni = call i32 @llvm.amdgcn.readanylane.i32(i32 %src)
+  br i1 %cond, label %.endif, label %.else
+.else:
+  %uni.else = add nuw nsw i32 %uni, 42
+  br label %.endif
+.endif:
+  %readanylane = phi i32 [ 0, %.entry ], [ %uni.else, %.else ]
+  call void asm sideeffect "; use $0", "s"(i32 %readanylane)
+  ret void
+}
+
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ptr.ll
new file mode 100644
index 00000000000000..f5110a48c0b0c9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ptr.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+
+define void @test_readanylane_p0(ptr addrspace(1) %out, ptr %src) {
+; CHECK-SDAG-LABEL: test_readanylane_p0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call ptr @llvm.amdgcn.readanylane.p0(ptr %src)
+  call void asm sideeffect "; use $0", "s"(ptr %readanylane)
+  ret void
+}
+
+define void @test_readanylane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
+; CHECK-SDAG-LABEL: test_readanylane_v3p0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v7
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v6
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s3, v5
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s2, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:5]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call <3 x ptr> @llvm.amdgcn.readanylane.v3p0(<3 x ptr> %src)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr> %readanylane)
+  ret void
+}
+
+define void @test_readanylane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) {
+; CHECK-SDAG-LABEL: test_readanylane_p3:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call ptr addrspace(3) @llvm.amdgcn.readanylane.p3(ptr addrspace(3) %src)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %readanylane)
+  ret void
+}
+
+define void @test_readanylane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src) {
+; CHECK-SDAG-LABEL: test_readanylane_v3p3:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s2, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:2]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call <3 x ptr addrspace(3)> @llvm.amdgcn.readanylane.v3p3(<3 x ptr addrspace(3)> %src)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %readanylane)
+  ret void
+}
+
+define void @test_readanylane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src) {
+; CHECK-SDAG-LABEL: test_readanylane_p5:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call ptr addrspace(5) @llvm.amdgcn.readanylane.p5(ptr addrspace(5) %src)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %readanylane)
+  ret void
+}
+
+define void @test_readanylane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src) {
+; CHECK-SDAG-LABEL: test_readanylane_v3p5:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s2, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:2]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call <3 x ptr addrspace(5)> @llvm.amdgcn.readanylane.v3p5(<3 x ptr addrspace(5)> %src)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(5)> %readanylane)
+  ret void
+}
+
+define void @test_readanylane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src) {
+; CHECK-SDAG-LABEL: test_readanylane_p6:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call ptr addrspace(6) @llvm.amdgcn.readanylane.p6(ptr addrspace(6) %src)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(6) %readanylane)
+  ret void
+}
+
+define void @test_readanylane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src) {
+; CHECK-SDAG-LABEL: test_readanylane_v3p6:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s2, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:2]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %readanylane = call <3 x ptr addrspace(6)> @llvm.amdgcn.readanylane.v3p6(<3 x ptr addrspace(6)> %src)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(6)> %readanylane)
+  ret void
+}

>From 6c7fef7c2b896678dbc821d85ee5cb6c126e3aec Mon Sep 17 00:00:00 2001
From: Russell Liu <Xin.Liu2 at amd.com>
Date: Mon, 11 Nov 2024 21:10:45 +0800
Subject: [PATCH 2/4] [AMDGPU] Add attribute convergent for readanylane

The new intrinsic has to be convergent: its value is uniform only at the
place where it is used.

This commit changes:
 + Add attribute `convergent`.
 + Remove testcase `test_readanylane_hoist`. The intrinsic is no longer
   allowed to be hoisted.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   5 +-
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     |  23 ++--
 .../CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll | 103 +++---------------
 3 files changed, 29 insertions(+), 102 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bb7931d4a95c92..03ad739c8bc1f5 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2152,11 +2152,12 @@ def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
-// This is similar to readfirstlane, but marks value that is uniform, allowed sunk / hoist into
+// This is similar to readfirstlane, but marks value that is uniform, allowed sunk into
 // control flow. The result is undefined if the value is actual divergent.
 def int_amdgcn_readanylane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
-            [IntrNoCallback, IntrNoFree, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+            [IntrConvergent, IntrNoCallback, IntrNoFree, IntrNoMem, IntrSpeculatable,
+             IntrWillReturn]>;
 
 // The lane argument must be uniform across the currently active threads of the
 // current wave. Otherwise, the result is undefined.
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index c1f35e62e633fd..2121548c4ffe42 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1119,17 +1119,17 @@ void SIFoldOperandsImpl::foldOperand(
         UseOpc == AMDGPU::SI_READANYLANE ||
         (UseOpc == AMDGPU::V_READLANE_B32 &&
          (int)UseOpIdx ==
-             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
-      // readanylane doesn't care exec
-      const bool ReadAnyLean = UseOpc == AMDGPU::SI_READANYLANE;
+         AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
+      const bool NeedExec = UseOpc != AMDGPU::SI_READANYLANE;
       // %vgpr = V_MOV_B32 imm
       // %sgpr = V_READFIRSTLANE_B32 %vgpr
       // =>
       // %sgpr = S_MOV_B32 imm
       if (FoldingImmLike) {
-        if (!ReadAnyLean && execMayBeModifiedBeforeUse(
-                                *MRI, UseMI->getOperand(UseOpIdx).getReg(),
-                                *OpToFold.getParent(), *UseMI))
+        if (execMayBeModifiedBeforeUse(*MRI,
+                                       UseMI->getOperand(UseOpIdx).getReg(),
+                                       *OpToFold.getParent(),
+                                       *UseMI))
           return;
 
         UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
@@ -1138,15 +1138,16 @@ void SIFoldOperandsImpl::foldOperand(
           UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
         else
           UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
-        if (!ReadAnyLean)
+        if (NeedExec)
           UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
         return;
       }
 
       if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
-        if (!ReadAnyLean && execMayBeModifiedBeforeUse(
-                                *MRI, UseMI->getOperand(UseOpIdx).getReg(),
-                                *OpToFold.getParent(), *UseMI))
+        if (execMayBeModifiedBeforeUse(*MRI,
+                                       UseMI->getOperand(UseOpIdx).getReg(),
+                                       *OpToFold.getParent(),
+                                       *UseMI))
           return;
 
         // %vgpr = COPY %sgpr0
@@ -1157,7 +1158,7 @@ void SIFoldOperandsImpl::foldOperand(
         UseMI->getOperand(1).setReg(OpToFold.getReg());
         UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
         UseMI->getOperand(1).setIsKill(false);
-        if (!ReadAnyLean)
+        if (NeedExec)
           UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
         return;
       }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
index 0da1f47d8fe1f3..2fdb684946b7fd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
@@ -362,96 +362,23 @@ define amdgpu_kernel void @test_readanylane_alloc() #1 {
   ret void
 }
 
-define void @test_readanylane_hoist(i1 %cond, i32 %src) #1 {
-; CHECK-SDAG-LABEL: test_readanylane_hoist:
-; CHECK-SDAG:       ; %bb.0:
-; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-SDAG-NEXT:    ; implicit-def: $sgpr0
-; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; CHECK-SDAG-NEXT:    s_xor_b32 s1, vcc_lo, -1
-; CHECK-SDAG-NEXT:    s_and_saveexec_b32 s2, s1
-; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-SDAG-NEXT:    s_xor_b32 s1, exec_lo, s2
-; CHECK-SDAG-NEXT:    s_cbranch_execz .LBB14_2
-; CHECK-SDAG-NEXT:  ; %bb.1: ; %.else
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
-; CHECK-SDAG-NEXT:    ; implicit-def: $vgpr1
-; CHECK-SDAG-NEXT:  .LBB14_2: ; %Flow
-; CHECK-SDAG-NEXT:    s_or_saveexec_b32 s1, s1
-; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s1
-; CHECK-SDAG-NEXT:    s_cbranch_execz .LBB14_4
-; CHECK-SDAG-NEXT:  ; %bb.3: ; %.then
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
-; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT:  .LBB14_4: ; %.endif
-; CHECK-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use v0
-; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readanylane_hoist:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-GISEL-NEXT:    ; implicit-def: $sgpr0
-; CHECK-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-GISEL-NEXT:    s_xor_b32 s1, vcc_lo, -1
-; CHECK-GISEL-NEXT:    s_and_saveexec_b32 s2, s1
-; CHECK-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-GISEL-NEXT:    s_xor_b32 s1, exec_lo, s2
-; CHECK-GISEL-NEXT:    s_cbranch_execz .LBB14_2
-; CHECK-GISEL-NEXT:  ; %bb.1: ; %.else
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v1
-; CHECK-GISEL-NEXT:    ; implicit-def: $vgpr1
-; CHECK-GISEL-NEXT:  .LBB14_2: ; %Flow
-; CHECK-GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; CHECK-GISEL-NEXT:    s_cbranch_execz .LBB14_4
-; CHECK-GISEL-NEXT:  ; %bb.3: ; %.then
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v1
-; CHECK-GISEL-NEXT:  .LBB14_4: ; %.endif
-; CHECK-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; CHECK-GISEL-NEXT:    ;;#ASMSTART
-; CHECK-GISEL-NEXT:    ; use s0
-; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  br i1 %cond, label %.then, label %.else
-.then:
-  %uni.then = call i32 @llvm.amdgcn.readanylane.i32(i32 %src)
-  br label %.endif
-.else:
-  %uni.else = call i32 @llvm.amdgcn.readanylane.i32(i32 %src)
-  br label %.endif
-.endif:
-  %readanylane = phi i32 [ %uni.then, %.then ], [ %uni.else, %.else ]
-  call void asm sideeffect "; use $0", "s"(i32 %readanylane)
-  ret void
-}
-
 define void @test_readanylane_suck(i1 %cond, i32 %src) #1 {
 ; CHECK-SDAG-LABEL: test_readanylane_suck:
 ; CHECK-SDAG:       ; %bb.0: ; %.entry
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
 ; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; CHECK-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT:    s_xor_b32 s1, vcc_lo, -1
-; CHECK-SDAG-NEXT:    s_and_saveexec_b32 s0, s1
-; CHECK-SDAG-NEXT:    s_cbranch_execz .LBB15_2
+; CHECK-SDAG-NEXT:    s_xor_b32 s2, vcc_lo, -1
+; CHECK-SDAG-NEXT:    s_and_saveexec_b32 s1, s2
 ; CHECK-SDAG-NEXT:  ; %bb.1: ; %.else
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
-; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-SDAG-NEXT:    s_add_i32 s1, s1, 42
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s1
-; CHECK-SDAG-NEXT:  .LBB15_2: ; %.endif
-; CHECK-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; CHECK-SDAG-NEXT:    s_add_i32 s0, s0, 42
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:  ; %bb.2: ; %.endif
+; CHECK-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; use v0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
@@ -461,18 +388,16 @@ define void @test_readanylane_suck(i1 %cond, i32 %src) #1 {
 ; CHECK-GISEL:       ; %bb.0: ; %.entry
 ; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
 ; CHECK-GISEL-NEXT:    s_mov_b32 s0, 0
 ; CHECK-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; CHECK-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-GISEL-NEXT:    s_xor_b32 s2, vcc_lo, -1
-; CHECK-GISEL-NEXT:    s_and_saveexec_b32 s1, s2
-; CHECK-GISEL-NEXT:    s_cbranch_execz .LBB15_2
+; CHECK-GISEL-NEXT:    s_xor_b32 s3, vcc_lo, -1
+; CHECK-GISEL-NEXT:    s_and_saveexec_b32 s2, s3
 ; CHECK-GISEL-NEXT:  ; %bb.1: ; %.else
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v1
-; CHECK-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
-; CHECK-GISEL-NEXT:    s_add_i32 s0, s0, 42
-; CHECK-GISEL-NEXT:  .LBB15_2: ; %.endif
-; CHECK-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; CHECK-GISEL-NEXT:    s_add_i32 s0, s1, 42
+; CHECK-GISEL-NEXT:  ; %bb.2: ; %.endif
+; CHECK-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    ; use s0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND

>From 5a93fe10ffadcd75e8d6893940b39b33baa13cab Mon Sep 17 00:00:00 2001
From: Russell Liu <Xin.Liu2 at amd.com>
Date: Wed, 13 Nov 2024 20:04:34 +0800
Subject: [PATCH 3/4] [AMDGPU] Remove unused attribute for readanylane

 + Remove unused attribute `speculatable`.
 + Change machine instruction from SAlu to VAlu, add attributes.
 + Add testcases for InstCombine and EarlyCSE.
 + Update docs.
---
 llvm/docs/AMDGPUUsage.rst                     |  3 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  7 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  7 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  1 -
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     |  7 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  9 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll | 14 +--
 .../Transforms/EarlyCSE/AMDGPU/intrinsics.ll  | 24 ++++++
 .../InstCombine/AMDGPU/amdgcn-intrinsics.ll   | 86 +++++++++++++++++++
 9 files changed, 135 insertions(+), 23 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5b83ea428c0bff..49e350d5ec1adb 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1204,6 +1204,9 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,
                                                    i64, double, pointers, multiples of the 32-bit vectors.
 
+  llvm.amdgcn.readanylane                          Similar to readfirstlane. But marks value that is uniform when used.
+                                                   The result is undefined if the value is actual divergent.
+
   llvm.amdgcn.readlane                             Provides direct access to v_readlane_b32. Returns the value in the
                                                    specified lane of the first input operand. The second operand specifies
                                                    the lane to read from. Currently implemented for i16, i32, float, half,
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 03ad739c8bc1f5..28b0d01463ed0d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2152,12 +2152,11 @@ def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
-// This is similar to readfirstlane, but marks value that is uniform, allowed sunk into
-// control flow. The result is undefined if the value is actual divergent.
+// This is similar to readfirstlane, but marks value that is uniform when used, allowed sunk
+// into control flow. The result is undefined if the value is actual divergent.
 def int_amdgcn_readanylane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
-            [IntrConvergent, IntrNoCallback, IntrNoFree, IntrNoMem, IntrSpeculatable,
-             IntrWillReturn]>;
+            [IntrConvergent, IntrNoCallback, IntrNoFree, IntrNoMem, IntrWillReturn]>;
 
 // The lane argument must be uniform across the currently active threads of the
 // current wave. Otherwise, the result is undefined.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a5e984bde0e6c4..db84a46e5ffd98 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -98,11 +98,9 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
 
 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                         unsigned NewOpc) const {
-  const bool NeedExec = NewOpc != AMDGPU::SI_READANYLANE;
   MI.setDesc(TII.get(NewOpc));
   MI.removeOperand(1); // Remove intrinsic ID.
-  if (NeedExec)
-    MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
 
   MachineOperand &Dst = MI.getOperand(0);
   MachineOperand &Src = MI.getOperand(1);
@@ -115,7 +113,8 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
   const TargetRegisterClass *SrcRC
     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
-  if (!DstRC || (NeedExec && DstRC != SrcRC))
+  // READANYLANE allows input is vgpr and output is sgpr.
+  if (!DstRC || (NewOpc != AMDGPU::SI_READANYLANE && DstRC != SrcRC))
     return false;
 
   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1728876eafffcc..2ffad18f35e0f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4659,7 +4659,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       [[fallthrough]];
     }
     case Intrinsic::amdgcn_readanylane:
-      [[fallthrough]];
     case Intrinsic::amdgcn_readfirstlane: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2121548c4ffe42..9b7644e94db371 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1120,7 +1120,6 @@ void SIFoldOperandsImpl::foldOperand(
         (UseOpc == AMDGPU::V_READLANE_B32 &&
          (int)UseOpIdx ==
          AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
-      const bool NeedExec = UseOpc != AMDGPU::SI_READANYLANE;
       // %vgpr = V_MOV_B32 imm
       // %sgpr = V_READFIRSTLANE_B32 %vgpr
       // =>
@@ -1138,8 +1137,7 @@ void SIFoldOperandsImpl::foldOperand(
           UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
         else
           UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
-        if (NeedExec)
-          UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
+        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
         return;
       }
 
@@ -1158,8 +1156,7 @@ void SIFoldOperandsImpl::foldOperand(
         UseMI->getOperand(1).setReg(OpToFold.getReg());
         UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
         UseMI->getOperand(1).setIsKill(false);
-        if (NeedExec)
-          UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
+        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
         return;
       }
     }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 575fac67288e01..ad6e64c0d7c8ae 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -546,8 +546,13 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
   let maybeAtomic = 0;
 }
 
-def SI_READANYLANE : SPseudoInstSI <(outs SReg_32:$dst), (ins VGPR_32:$src)> {
-  let SALU = 1;
+def SI_READANYLANE : VPseudoInstSI <(outs SReg_32:$dst), (ins VGPR_32:$src)> {
+  let Uses = [EXEC];
+  let VALU = 1;
+  let hasSideEffects = 0;
+  let isConvergent = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
 }
 
 // Used as an isel pseudo to directly emit initialization with an
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
index 2fdb684946b7fd..3b8232e2e7d826 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readanylane.ll
@@ -7,7 +7,7 @@ define void @test_readanylane_i1(ptr addrspace(1) %out, i1 %src) #1 {
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
-; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; CHECK-SDAG-NEXT:    s_and_b32 s0, s0, 1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    global_store_b8 v[0:1], v2, off
@@ -17,7 +17,7 @@ define void @test_readanylane_i1(ptr addrspace(1) %out, i1 %src) #1 {
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
-; CHECK-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; CHECK-GISEL-NEXT:    s_and_b32 s0, s0, 1
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    global_store_b8 v[0:1], v2, off
@@ -56,7 +56,7 @@ define void @test_readanylane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %sr
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 42, v2
 ; CHECK-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
 ; CHECK-SDAG-NEXT:    s_bitcmp1_b32 s0, 0
 ; CHECK-SDAG-NEXT:    s_cselect_b32 vcc_lo, -1, 0
@@ -69,7 +69,7 @@ define void @test_readanylane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %sr
 ; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 42, v2
 ; CHECK-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; CHECK-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; CHECK-GISEL-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -89,7 +89,7 @@ define void @test_readanylane_i16(i16 %src) #1 {
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; CHECK-SDAG-NEXT:    s_and_b32 s0, s0, 0xffff
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; use s0
@@ -368,7 +368,7 @@ define void @test_readanylane_suck(i1 %cond, i32 %src) #1 {
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
-; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; CHECK-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; CHECK-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-SDAG-NEXT:    s_xor_b32 s2, vcc_lo, -1
@@ -390,7 +390,7 @@ define void @test_readanylane_suck(i1 %cond, i32 %src) #1 {
 ; CHECK-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
 ; CHECK-GISEL-NEXT:    s_mov_b32 s0, 0
-; CHECK-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; CHECK-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; CHECK-GISEL-NEXT:    s_xor_b32 s3, vcc_lo, -1
 ; CHECK-GISEL-NEXT:    s_and_saveexec_b32 s2, s3
diff --git a/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll b/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
index 08a13856c81b6d..a6b17cb56b3b2e 100644
--- a/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
+++ b/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
@@ -33,4 +33,28 @@ define void @cse_nonzero_offset(ptr addrspace(1) %out, <4 x i32> %in) {
   ret void
 }
 
+define i32 @readanylane_readanylane_divergent_block(i1 %cond, i32 %arg) {
+; CHECK-LABEL: define i32 @readanylane_readanylane_divergent_block(
+; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]]) {
+; CHECK-NEXT:  [[_ENTRY:.*:]]
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readanylane.i32(i32 [[ARG]])
+; CHECK-NEXT:    br i1 [[COND]], [[DOTTHEN:label %.*]], [[DOTEXIT:label %.*]]
+; CHECK:       [[_THEN:.*:]]
+; CHECK-NEXT:    br [[DOTEXIT]]
+; CHECK:       [[_EXIT:.*:]]
+; CHECK-NEXT:    ret i32 [[READ0]]
+;
+.entry:
+  %read0 = call i32 @llvm.amdgcn.readanylane.i32(i32 %arg)
+  br i1 %cond, label %.then, label %.exit
+
+.then:
+  %read1 = call i32 @llvm.amdgcn.readanylane.i32(i32 %arg)
+  br label %.exit
+
+.exit:
+  %result = phi i32 [ %read0, %.entry ], [ %read1, %.then ]
+  ret i32 %result
+}
+
 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> nocapture, i32, i32)
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 779def76fc58d3..771326177297a1 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2925,6 +2925,92 @@ bb1:
   ret i32 %read1
 }
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.readanylane
+; --------------------------------------------------------------------
+
+declare i32 @llvm.amdgcn.readanylane(i32)
+
+define amdgpu_kernel void @readanylane_constant(i32 %arg) {
+; CHECK-LABEL: @readanylane_constant(
+; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.readanylane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    store volatile i32 [[VAR]], ptr undef, align 4
+; CHECK-NEXT:    store volatile i32 0, ptr undef, align 4
+; CHECK-NEXT:    store volatile i32 123, ptr undef, align 4
+; CHECK-NEXT:    store volatile i32 ptrtoint (ptr @gv to i32), ptr undef, align 4
+; CHECK-NEXT:    store volatile i32 undef, ptr undef, align 4
+; CHECK-NEXT:    ret void
+;
+  %var = call i32 @llvm.amdgcn.readanylane(i32 %arg)
+  %zero = call i32 @llvm.amdgcn.readanylane(i32 0)
+  %imm = call i32 @llvm.amdgcn.readanylane(i32 123)
+  %constexpr = call i32 @llvm.amdgcn.readanylane(i32 ptrtoint (ptr @gv to i32))
+  %undef = call i32 @llvm.amdgcn.readanylane(i32 undef)
+  store volatile i32 %var, ptr undef
+  store volatile i32 %zero, ptr undef
+  store volatile i32 %imm, ptr undef
+  store volatile i32 %constexpr, ptr undef
+  store volatile i32 %undef, ptr undef
+  ret void
+}
+
+define i32 @readanylane_idempotent(i32 %arg) {
+; CHECK-LABEL: @readanylane_idempotent(
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readanylane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    ret i32 [[READ0]]
+;
+  %read0 = call i32 @llvm.amdgcn.readanylane(i32 %arg)
+  %read1 = call i32 @llvm.amdgcn.readanylane(i32 %read0)
+  %read2 = call i32 @llvm.amdgcn.readanylane(i32 %read1)
+  ret i32 %read2
+}
+
+define i32 @readanylane_readlane(i32 %arg) {
+; CHECK-LABEL: @readanylane_readlane(
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readanylane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    ret i32 [[READ0]]
+;
+  %read0 = call i32 @llvm.amdgcn.readanylane(i32 %arg)
+  %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
+  ret i32 %read1
+}
+
+define i32 @readanylane_readanylane_different_block(i32 %arg) {
+; CHECK-LABEL: @readanylane_readanylane_different_block(
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readanylane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readanylane.i32(i32 [[READ0]])
+; CHECK-NEXT:    ret i32 [[READ1]]
+;
+bb0:
+  %read0 = call i32 @llvm.amdgcn.readanylane(i32 %arg)
+  br label %bb1
+
+bb1:
+  %read1 = call i32 @llvm.amdgcn.readanylane(i32 %read0)
+  ret i32 %read1
+}
+
+define i32 @readanylane_readlane_different_block(i32 %arg) {
+; CHECK-LABEL: @readanylane_readlane_different_block(
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readanylane.i32(i32 [[READ0]])
+; CHECK-NEXT:    ret i32 [[READ1]]
+;
+bb0:
+  %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 0)
+  br label %bb1
+
+bb1:
+  %read1 = call i32 @llvm.amdgcn.readanylane(i32 %read0)
+  ret i32 %read1
+}
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.readlane
 ; --------------------------------------------------------------------

>From ac399310c3d2d52981b627f52e5db8e86d725596 Mon Sep 17 00:00:00 2001
From: Russell Liu <Xin.Liu2 at amd.com>
Date: Thu, 14 Nov 2024 18:40:07 +0800
Subject: [PATCH 4/4] [AMDGPU] Update testcase for readanylane

Fix the FileCheck patterns for readanylane that were incorrectly added to the EarlyCSE test in the previous patch.
---
 llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll b/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
index a6b17cb56b3b2e..867e4ca3f8d903 100644
--- a/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
+++ b/llvm/test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
@@ -38,11 +38,13 @@ define i32 @readanylane_readanylane_divergent_block(i1 %cond, i32 %arg) {
 ; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]]) {
 ; CHECK-NEXT:  [[_ENTRY:.*:]]
 ; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readanylane.i32(i32 [[ARG]])
-; CHECK-NEXT:    br i1 [[COND]], [[DOTTHEN:label %.*]], [[DOTEXIT:label %.*]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[DOTTHEN:.*]], [[DOTEXIT:label %.*]]
 ; CHECK:       [[_THEN:.*:]]
+; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readanylane.i32(i32 [[ARG]])
 ; CHECK-NEXT:    br [[DOTEXIT]]
 ; CHECK:       [[_EXIT:.*:]]
-; CHECK-NEXT:    ret i32 [[READ0]]
+; CHECK-NEXT:    [[RESULT:%.*]] = phi i32 [ [[READ0]], [[DOTENTRY:%.*]] ], [ [[READ1]], %[[DOTTHEN]] ]
+; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
 .entry:
   %read0 = call i32 @llvm.amdgcn.readanylane.i32(i32 %arg)



More information about the llvm-commits mailing list