[clang] 5feb32b - [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (#89217)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Jun 25 02:05:23 PDT 2024
Author: Vikram Hegde
Date: 2024-06-25T14:35:19+05:30
New Revision: 5feb32ba929f9e517c530217cabb09d1d734a763
URL: https://github.com/llvm/llvm-project/commit/5feb32ba929f9e517c530217cabb09d1d734a763
DIFF: https://github.com/llvm/llvm-project/commit/5feb32ba929f9e517c530217cabb09d1d734a763.diff
LOG: [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (#89217)
This patch is intended to be the first of a series with the end goal of
adapting the atomic optimizer pass to support i64 and f64 operations (along
with removing all unnecessary bitcasts). This legalizes 64-bit readlane,
writelane and readfirstlane ops pre-ISel.
---------
Co-authored-by: vikramRH <vikhegde at amd.com>
Added:
llvm/test/CodeGen/AMDGPU/convergence-laneops.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
Modified:
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenOpenCL/builtins-amdgcn.cl
llvm/docs/AMDGPUUsage.rst
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/VOP1Instructions.td
llvm/lib/Target/AMDGPU/VOP2Instructions.td
llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll
llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 77fd711e99dba..ce22e13d0004f 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18467,6 +18467,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
return Builder.CreateCall(F, Args);
}
+ case AMDGPU::BI__builtin_amdgcn_readlane:
+ return emitBuiltinWithOneOverloadedType<2>(*this, E,
+ Intrinsic::amdgcn_readlane);
+ case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+ return emitBuiltinWithOneOverloadedType<1>(*this, E,
+ Intrinsic::amdgcn_readfirstlane);
case AMDGPU::BI__builtin_amdgcn_div_fixup:
case AMDGPU::BI__builtin_amdgcn_div_fixupf:
case AMDGPU::BI__builtin_amdgcn_div_fixuph:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 95daa2cdbc92c..6a6d5b1dfed3d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -308,14 +308,14 @@ void test_ds_bpermute(global int* out, int a, int b)
}
// CHECK-LABEL: @test_readfirstlane
-// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
void test_readfirstlane(global int* out, int a)
{
*out = __builtin_amdgcn_readfirstlane(a);
}
// CHECK-LABEL: @test_readlane
-// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
void test_readlane(global int* out, int a, int b)
{
*out = __builtin_amdgcn_readlane(a, b);
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5a16457412d24..192df32229787 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1188,6 +1188,23 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
:ref:`llvm.set.fpenv<int_set_fpenv>` Sets the floating point environment to the specified state.
+ llvm.amdgcn.readfirstlane Provides direct access to v_readfirstlane_b32. Returns the value in
+ the lowest active lane of the input operand. Currently implemented
+ for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,
+ i64, double, pointers, and vectors whose size is a multiple of 32 bits.
+
+ llvm.amdgcn.readlane Provides direct access to v_readlane_b32. Returns the value in the
+ specified lane of the first input operand. The second operand specifies
+ the lane to read from. Currently implemented for i16, i32, float, half,
+ bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
+ and vectors whose size is a multiple of 32 bits.
+
+ llvm.amdgcn.writelane Provides direct access to v_writelane_b32. Writes the value in the first input
+ operand to the specified lane of the divergent output. The second operand
+ specifies the lane to write. Currently implemented for i16, i32, float,
+ half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
+ and vectors whose size is a multiple of 32 bits.
+
llvm.amdgcn.wave.reduce.umin Performs an arithmetic unsigned min reduction on the unsigned values
provided by each lane in the wavefront.
Intrinsic takes a hint for reduction strategy using second operand
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 7a5e919fe26e3..11662ccc1a695 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2042,26 +2042,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
def int_amdgcn_readfirstlane :
- ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
- ClangBuiltin<"__builtin_amdgcn_readlane">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
- ClangBuiltin<"__builtin_amdgcn_writelane">,
- Intrinsic<[llvm_i32_ty], [
- llvm_i32_ty, // uniform value to write: returned by the selected lane
- llvm_i32_ty, // uniform lane select
- llvm_i32_ty // returned by all lanes other than the selected one
+ Intrinsic<[llvm_any_ty], [
+ LLVMMatchType<0>, // uniform value to write: returned by the selected lane
+ llvm_i32_ty, // uniform lane select
+ LLVMMatchType<0> // returned by all lanes other than the selected one
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 38cc5a9bef969..0fdf3e468a71b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -424,7 +424,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
V = B.CreateBitCast(V, IntNTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -514,10 +514,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
- Function *WriteLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+ Function *ReadLane = Intrinsic::getDeclaration(
+ M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+ Function *WriteLane = Intrinsic::getDeclaration(
+ M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7251a24b29fa..4b48091b7143e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5430,6 +5430,98 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
return true;
}
+// TODO: Fix pointer type handling
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+ MachineInstr &MI,
+ Intrinsic::ID IID) const {
+
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+
+ auto createLaneOp = [&IID, &B](Register Src0, Register Src1, Register Src2,
+ LLT VT) -> Register {
+ auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
+ switch (IID) {
+ case Intrinsic::amdgcn_readfirstlane:
+ return LaneOp.getReg(0);
+ case Intrinsic::amdgcn_readlane:
+ return LaneOp.addUse(Src1).getReg(0);
+ case Intrinsic::amdgcn_writelane:
+ return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+ default:
+ llvm_unreachable("unhandled lane op");
+ }
+ };
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(2).getReg();
+ Register Src1, Src2;
+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+ Src1 = MI.getOperand(3).getReg();
+ if (IID == Intrinsic::amdgcn_writelane) {
+ Src2 = MI.getOperand(4).getReg();
+ }
+ }
+
+ LLT Ty = MRI.getType(DstReg);
+ unsigned Size = Ty.getSizeInBits();
+
+ if (Size == 32) {
+ // Already legal
+ return true;
+ }
+
+ if (Size < 32) {
+ Src0 = B.buildAnyExt(S32, Src0).getReg(0);
+ if (Src2.isValid())
+ Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
+
+ Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
+ B.buildTrunc(DstReg, LaneOpDst);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (Size % 32 != 0)
+ return false;
+
+ LLT PartialResTy = S32;
+ if (Ty.isVector()) {
+ LLT EltTy = Ty.getElementType();
+ switch (EltTy.getSizeInBits()) {
+ case 16:
+ PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
+ break;
+ case 32:
+ PartialResTy = EltTy;
+ break;
+ default:
+ // Handle all other cases via S32 pieces;
+ break;
+ }
+ }
+
+ SmallVector<Register, 2> PartialRes;
+ unsigned NumParts = Size / 32;
+ MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
+ MachineInstrBuilder Src2Parts;
+
+ if (Src2.isValid())
+ Src2Parts = B.buildUnmerge(PartialResTy, Src2);
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Src0 = Src0Parts.getReg(i);
+ if (Src2.isValid())
+ Src2 = Src2Parts.getReg(i);
+ PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
+ }
+
+ B.buildMergeLikeInstr(DstReg, PartialRes);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -7370,6 +7462,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
Observer.changedInstr(MI);
return true;
}
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_writelane:
+ case Intrinsic::amdgcn_readfirstlane:
+ return legalizeLaneOp(Helper, MI, IntrID);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 4b1d821dadc21..ae01bb29c1108 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -210,6 +210,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
Intrinsic::ID IID) const;
+ bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
+ Intrinsic::ID IID) const;
+
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4f8882ed1cc96..ac4cbd617b81d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6098,6 +6098,157 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
}
+static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ unsigned ValSize = VT.getSizeInBits();
+ unsigned IID = N->getConstantOperandVal(0);
+ SDLoc SL(N);
+ MVT IntVT = MVT::getIntegerVT(ValSize);
+
+ auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
+ SDValue Src2, MVT ValT) -> SDValue {
+ SmallVector<SDValue, 8> Operands;
+ Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
+ switch (IID) {
+ case Intrinsic::amdgcn_readfirstlane:
+ Operands.push_back(Src0);
+ break;
+ case Intrinsic::amdgcn_readlane:
+ Operands.push_back(Src0);
+ Operands.push_back(Src1);
+ break;
+ case Intrinsic::amdgcn_writelane:
+ Operands.push_back(Src0);
+ Operands.push_back(Src1);
+ Operands.push_back(Src2);
+ break;
+ }
+
+ if (SDNode *GL = N->getGluedNode()) {
+ assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+ GL = GL->getOperand(0).getNode();
+ Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
+ SDValue(GL, 0)));
+ }
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
+ };
+
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1, Src2;
+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+ Src1 = N->getOperand(2);
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2 = N->getOperand(3);
+ }
+
+ if (ValSize == 32) {
+ // Already legal
+ return SDValue();
+ }
+
+ if (ValSize < 32) {
+ bool IsFloat = VT.isFloatingPoint();
+ Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
+ SL, MVT::i32);
+ if (Src2.getNode()) {
+ Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
+ SL, MVT::i32);
+ }
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
+ SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+ return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
+ }
+
+ if (ValSize % 32 != 0)
+ return SDValue();
+
+ auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
+ EVT VT = N->getValueType(0);
+ unsigned NE = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ SmallVector<SDValue, 8> Scalars;
+ unsigned NumOperands = N->getNumOperands();
+ SmallVector<SDValue, 4> Operands(NumOperands);
+ SDNode *GL = N->getGluedNode();
+
+ // only handle convergencectrl_glue
+ assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+
+ for (unsigned i = 0; i != NE; ++i) {
+ for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
+ ++j) {
+ SDValue Operand = N->getOperand(j);
+ EVT OperandVT = Operand.getValueType();
+ if (OperandVT.isVector()) {
+ // A vector operand; extract a single element.
+ EVT OperandEltVT = OperandVT.getVectorElementType();
+ Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
+ Operand, DAG.getVectorIdxConstant(i, SL));
+ } else {
+ // A scalar operand; just use it as is.
+ Operands[j] = Operand;
+ }
+ }
+
+ if (GL)
+ Operands[NumOperands - 1] =
+ DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
+ SDValue(GL->getOperand(0).getNode(), 0));
+
+ Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
+ }
+
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
+ return DAG.getBuildVector(VecVT, SL, Scalars);
+ };
+
+ if (VT.isVector()) {
+ switch (MVT::SimpleValueType EltTy =
+ VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ case MVT::i32:
+ case MVT::f32: {
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
+ return unrollLaneOp(LaneOp.getNode());
+ }
+ case MVT::i16:
+ case MVT::f16:
+ case MVT::bf16: {
+ MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
+ SmallVector<SDValue, 4> Pieces;
+ for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
+ SDValue Src0SubVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ SDValue Src2SubVec;
+ if (Src2)
+ Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
+ EltIdx += 2;
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
+ }
+ default:
+ // Handle all other cases by bitcasting to i32 vectors
+ break;
+ }
+ }
+
+ MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+ Src0 = DAG.getBitcast(VecVT, Src0);
+
+ if (Src2)
+ Src2 = DAG.getBitcast(VecVT, Src2);
+
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
+ SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
+ return DAG.getBitcast(VT, UnrolledLaneOp);
+}
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -8564,6 +8715,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_addrspacecast_nonnull:
return lowerADDRSPACECAST(Op, DAG);
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_writelane:
+ return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 531b23d3877cb..835f44f9d0d61 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3389,7 +3389,7 @@ def : GCNPat<
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
- (int_amdgcn_readfirstlane (i32 imm:$src)),
+ (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index d39b1a47f0526..0794ba2f66f40 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -112,7 +112,7 @@ class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies {
!if(P.HasOMod,
[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node P.Src0RC32:$src0))]
+ [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))]
)
);
}
@@ -249,12 +249,17 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
// FIXME: Specify SchedRW for READFIRSTLANE_B32
// TODO: There is VOP3 encoding also
def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
- getVOP1Pat<int_amdgcn_readfirstlane,
- VOP_READFIRSTLANE>.ret, 1> {
+ [], 1> {
let isConvergent = 1;
let IsInvalidSingleUseConsumer = 1;
}
+foreach vt = Reg32Types.types in {
+ def : GCNPat<(vt (int_amdgcn_readfirstlane (vt VRegOrLdsSrc_32:$src0))),
+ (V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0))
+ >;
+}
+
let isReMaterializable = 1 in {
let SchedRW = [WriteDoubleCvt] in {
// OMod clears exceptions when set in this instruction
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index bed8d94a26dcb..9989752c2f6bc 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -780,16 +780,24 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag,
// These are special and do not read the exec mask.
let isConvergent = 1, Uses = []<Register>, IsInvalidSingleUseConsumer = 1 in {
-def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
- [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
+def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>;
let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
- [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]> {
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> {
let IsInvalidSingleUseProducer = 1;
}
} // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
+foreach vt = Reg32Types.types in {
+ def : GCNPat<(vt (int_amdgcn_readlane vt:$src0, i32:$src1)),
+ (V_READLANE_B32 VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1)
+ >;
+
+ def : GCNPat<(vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)),
+ (V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2)
+ >;
+}
+
let isReMaterializable = 1 in {
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 26c85e83b53ad..74d2f53d7b317 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -56,9 +56,9 @@ define amdgpu_kernel void @mov_dpp8(ptr addrspace(1) %out, i32 %in) #0 {
ret void
}
-; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2)
+; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 {
- %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2)
+ %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
store i32 %tmp0, ptr addrspace(1) %out
ret void
}
@@ -237,7 +237,7 @@ declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1
-declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1
+declare i32 @llvm.amdgcn.writelane.i32(i32, i32, i32) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index 220dc70165e87..bdfafa89cd047 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
@@ -74,7 +73,7 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
; IR-NEXT: ret void
@@ -172,7 +171,7 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
; IR-NEXT: ret void
@@ -273,7 +272,7 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1
; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP16]], i32 0, i32 0, i32 0)
@@ -374,7 +373,7 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
; IR-NEXT: [[ARG_VEC:%.*]] = bitcast i128 [[ARG_INT]] to <4 x i32>
@@ -476,7 +475,7 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
; IR-NEXT: [[ARG_VEC:%.*]] = bitcast i128 [[ARG_INT]] to <4 x i32>
@@ -581,7 +580,7 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) {
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1
; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]]
; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
index c07cd4e493b9a..019f76aa44a87 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
@@ -48,7 +48,7 @@ define amdgpu_kernel void @ham(ptr addrspace(4) %arg) {
; CHECK-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP6]], [[BB7]] ], [ [[TMP16:%.*]], [[COMPUTELOOP]] ]
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
-; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[PHI]], i32 [[TMP11]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[PHI]], i32 [[TMP11]])
; CHECK-NEXT: [[TMP13]] = add i32 [[ACCUMULATOR]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = shl i64 1, [[TMP10]]
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], -1
diff --git a/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll b/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll
new file mode 100644
index 0000000000000..b7a1749be18bc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll
@@ -0,0 +1,70 @@
+; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
+; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=1100 -verify-machineinstrs < %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+
+; FIXME: Merge these tests with existing lane op tests (llvm.amdgcn.readlane.ll, llvm.amdgcn.writelane.ll ...) once the crash is fixed.
+
+; CHECK-LABEL: name: basic_readfirstlane_i64
+; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
+; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
+; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+define i64 @basic_readfirstlane_i64(i64 %src, i1 %cond) #0 {
+entry:
+ %t = call token @llvm.experimental.convergence.anchor()
+ %x = add i64 %src, 1
+ br i1 %cond, label %then, label %else
+
+then:
+; CHECK-ERROR: Cannot mix controlled and uncontrolled convergence in the same function.
+; CHECK-ERROR: V_READFIRSTLANE_B32
+ %r = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %x) [ "convergencectrl"(token %t) ]
+ br label %else
+
+else:
+ %p = phi i64 [%r, %then], [%x, %entry]
+ ret i64 %p
+}
+
+; CHECK-LABEL: name: basic_readlane_i64
+; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
+; ISEL: {{.*}} = V_READLANE_B32 {{.*}}, implicit [[TOKEN]]
+; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
+; ISEL: {{.*}} = V_READLANE_B32 {{.*}}, implicit [[TOKEN]]
+define i64 @basic_readlane_i64(i64 %src, i32 %lane, i1 %cond) #0 {
+entry:
+ %t = call token @llvm.experimental.convergence.anchor()
+ %x = add i64 %src, 1
+ br i1 %cond, label %then, label %else
+
+then:
+ %r = call i64 @llvm.amdgcn.readlane.i64(i64 %x, i32 %lane) [ "convergencectrl"(token %t) ]
+ br label %else
+
+else:
+ %p = phi i64 [%r, %then], [%x, %entry]
+ ret i64 %p
+}
+
+; CHECK-LABEL: name: basic_writelane_i64
+; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
+; ISEL: {{.*}} = V_WRITELANE_B32 {{.*}}, implicit [[TOKEN]]
+; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
+; ISEL: {{.*}} = V_WRITELANE_B32 {{.*}}, implicit [[TOKEN]]
+define i64 @basic_writelane_i64(i64 %src, i1 %cond, i32 %lane, ptr addrspace(1) %out) #0 {
+entry:
+ %old = load i64, ptr addrspace(1) %out
+ %t = call token @llvm.experimental.convergence.anchor()
+ %x = add i64 %src, 1
+ br i1 %cond, label %then, label %else
+
+then:
+ %r = call i64 @llvm.amdgcn.writelane.i64(i64 %x, i32 %lane, i64 %old) [ "convergencectrl"(token %t) ]
+ br label %else
+
+else:
+ %p = phi i64 [%r, %then], [%x, %entry]
+ ret i64 %p
+}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
index 6b47f81bccb71..6c61c837881c4 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
@@ -130,7 +130,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -193,7 +193,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -251,7 +251,7 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -310,7 +310,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -364,7 +364,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -421,7 +421,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -473,7 +473,7 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -526,7 +526,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -586,7 +586,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -649,7 +649,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -707,7 +707,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -766,7 +766,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -820,7 +820,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -878,7 +878,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -931,7 +931,7 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -985,7 +985,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1040,7 +1040,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1098,7 +1098,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1151,7 +1151,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1205,7 +1205,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1260,7 +1260,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1318,7 +1318,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1371,7 +1371,7 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1425,7 +1425,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
index 03434caee2331..722c069f90a8c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -29,7 +29,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_uns
; IR: 16:
; IR-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
+; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]])
; IR-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
; IR-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
@@ -62,7 +62,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP28:%.*]]
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -76,11 +76,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]])
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]])
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = fadd float [[ACCUMULATOR]], [[TMP24]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -120,7 +120,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
; IR-DPP-NEXT: [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63)
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63)
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]])
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -131,7 +131,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]])
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]])
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
; IR-DPP-NEXT: [[TMP39:%.*]] = fadd float [[TMP37]], [[TMP38]]
@@ -167,7 +167,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
@@ -199,7 +199,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -232,7 +232,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -246,11 +246,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -290,7 +290,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -301,7 +301,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -337,7 +337,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
@@ -369,7 +369,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -402,7 +402,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -416,11 +416,11 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -460,7 +460,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -471,7 +471,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -503,7 +503,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_uns
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
; IR-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
; IR-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
@@ -536,7 +536,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP28:%.*]])
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -550,11 +550,11 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]])
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]])
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP24]])
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -594,7 +594,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]])
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63)
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63)
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]])
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -605,7 +605,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]])
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]])
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.minnum.f32(float [[TMP37]], float [[TMP38]])
@@ -637,7 +637,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]]
@@ -665,7 +665,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
; IR-DPP: 12:
; IR-DPP-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]]
@@ -698,7 +698,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -712,11 +712,11 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -756,7 +756,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -767,7 +767,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP37]], float [[TMP38]], metadata !"fpexcept.strict") #[[ATTR8]]
@@ -803,7 +803,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
@@ -835,7 +835,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -868,7 +868,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -882,11 +882,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -926,7 +926,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -937,7 +937,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -1084,8 +1084,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_s
; IR-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]])
-; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]])
+; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]])
+; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]])
; IR-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1136,8 +1136,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1174,8 +1174,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1226,8 +1226,8 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s
; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1264,8 +1264,8 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s
; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1312,8 +1312,8 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_s
; IR-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
; IR-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]])
-; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]])
+; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]])
+; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]])
; IR-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
; IR-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
; IR-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
@@ -1360,8 +1360,8 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
@@ -1394,8 +1394,8 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
; IR-DPP-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-DPP-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
; IR-DPP-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
; IR-DPP-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
; IR-DPP-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
@@ -1446,8 +1446,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_
; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1484,8 +1484,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_
; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll
index f954560d0f5ca..4b4c99b3cd14c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll
@@ -83,7 +83,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, ptr addrspace(
; IR-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP6]], [[ENTRY]] ], [ [[TMP16:%.*]], [[COMPUTELOOP]] ]
; IR-NEXT: [[TMP10:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
-; IR-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[VALUE]], i32 [[TMP11]])
+; IR-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[VALUE]], i32 [[TMP11]])
; IR-NEXT: [[TMP13]] = add i32 [[ACCUMULATOR]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = shl i64 1, [[TMP10]]
; IR-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], -1
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
index 86e3d9338e078..38823681d1bb5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -69,7 +69,7 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #
; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]])
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]]
@@ -107,7 +107,7 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
-; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 63)
+; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63)
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float
; IR-DPP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
; IR-DPP-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0
@@ -191,7 +191,7 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]])
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]]
@@ -229,7 +229,7 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
-; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 63)
+; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63)
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float
; IR-DPP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
; IR-DPP-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
index 239fe274d5232..fa66a0fdc76ce 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -61,7 +61,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]])
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = fadd float [[ACCUMULATOR]], [[TMP18]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -100,7 +100,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63)
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63)
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]])
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -196,7 +196,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -235,7 +235,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -331,7 +331,7 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -370,7 +370,7 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -438,7 +438,7 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]])
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP18]])
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -477,7 +477,7 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]])
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63)
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63)
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]])
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -565,7 +565,7 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -604,7 +604,7 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -700,7 +700,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -739,7 +739,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 0284f44f5f14d..cc6c630ae6466 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,65 +1,387 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+declare i64 @llvm.amdgcn.readfirstlane.i64(i64) #0
+declare double @llvm.amdgcn.readfirstlane.f64(double) #0
-; CHECK-LABEL: {{^}}test_readfirstlane:
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
-define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 {
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
+define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_imm:
-; CHECK: s_mov_b32 [[SGPR_VAL:s[0-9]]], 32
-; CHECK-NOT: [[SGPR_VAL]]
-; CHECK: ; use [[SGPR_VAL]]
-define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) #1 {
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s5
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %src)
+ store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s5
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double %src)
+ store double %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_mov_b32 s0, 32
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_mov_b32 s0, 32
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold:
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
-; CHECK-NOT: [[VVAL]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
-define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) #1 {
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_mov_b64 s[0:1], 32
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 32
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 32)
+ call void asm sideeffect "; use $0", "s"(i64 %readfirstlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_mov_b32 s0, 0
+; CHECK-SDAG-NEXT: s_mov_b32 s1, 0x40400000
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_mov_b32 s0, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double 32.0)
+ call void asm sideeffect "; use $0", "s"(double %readfirstlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_m0:
-; CHECK: s_mov_b32 m0, -1
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 32)
+ store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double 32.0)
+ store double %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_m0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_m0:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]]
-; CHECK: ;;#ASMEND
-; CHECK-NOT: [[SGPR]]
-; CHECK-NOT: readfirstlane
-; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; Make sure this doesn't crash.
-; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %sgpr = call i64 asm "s_mov_b64 $0, 0", "=s"()
+ %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %sgpr)
+ store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %sgpr = call double asm "s_mov_b64 $0, 0", "=s"()
+ %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double %sgpr)
+ store double %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_fi:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s9
+; CHECK-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-SDAG-NEXT: s_mov_b32 s4, 0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_fi:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s9
+; CHECK-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
%alloca = alloca i32, addrspace(5)
%int = ptrtoint ptr addrspace(5) %alloca to i32
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
@@ -67,5 +389,210 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
ret void
}
+define void @test_readfirstlane_half(ptr addrspace(1) %out, half %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_half:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_half:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call half @llvm.amdgcn.readfirstlane.f16(half %src)
+ call void asm sideeffect "; use $0", "s"(half %x)
+ ret void
+}
+
+define void @test_readfirstlane_float(ptr addrspace(1) %out, float %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_float:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_float:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call float @llvm.amdgcn.readfirstlane.f32(float %src)
+ call void asm sideeffect "; use $0", "s"(float %x)
+ ret void
+}
+
+define void @test_readfirstlane_bfloat(ptr addrspace(1) %out, bfloat %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_bfloat:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_bfloat:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call bfloat @llvm.amdgcn.readfirstlane.bf16(bfloat %src)
+ call void asm sideeffect "; use $0", "s"(bfloat %x)
+ ret void
+}
+
+define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 0xffff
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src)
+ call void asm sideeffect "; use $0", "s"(i16 %x)
+ ret void
+}
+
+define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v2f16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v2f16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src)
+ call void asm sideeffect "; use $0", "s"(<2 x half> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v2f32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v2f32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:5]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x float> @llvm.amdgcn.readfirstlane.v2f32(<2 x float> %src)
+ call void asm sideeffect "; use $0", "s"(<2 x float> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v7i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:10]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v7i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:10]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <7 x i32> @llvm.amdgcn.readfirstlane.v7i32(<7 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<7 x i32> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v8i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v8i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <8 x i16> @llvm.amdgcn.readfirstlane.v8i16(<8 x i16> %src)
+ call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
+ ret void
+}
+
attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
new file mode 100644
index 0000000000000..3882a5f0f9f4f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+
+define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src)
+ call void asm sideeffect "; use $0", "s"(ptr %x)
+ ret void
+}
+
+define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
+ ret void
+}
+
+define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p3:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) %src)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x)
+ ret void
+}
+
+define void @test_readfirstlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p3:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v3p3(<3 x ptr addrspace(3)> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x)
+ ret void
+}
+
+define void @test_readfirstlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p5:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(5) @llvm.amdgcn.readfirstlane.p5(ptr addrspace(5) %src)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %x)
+ ret void
+}
+
+define void @test_readfirstlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p5:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readfirstlane.v3p5(<3 x ptr addrspace(5)> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(5)> %x)
+ ret void
+}
+
+define void @test_readfirstlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p6:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(6) @llvm.amdgcn.readfirstlane.p6(ptr addrspace(6) %src)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(6) %x)
+ ret void
+}
+
+define void @test_readfirstlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p6:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readfirstlane.v3p6(<3 x ptr addrspace(6)> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(6)> %x)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 51465f6bd10ce..66e1f9396de5a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,82 +1,899 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s
-declare i32 @llvm.amdgcn.readlane(i32, i32) #0
+declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0
+declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0
+declare double @llvm.amdgcn.readlane.f64(double, i32) #0
-; CHECK-LABEL: {{^}}test_readlane_sreg_sreg:
-; CHECK-NOT: v_readlane_b32
-define amdgpu_kernel void @test_readlane_sreg_sreg(i32 %src0, i32 %src1) #1 {
- %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
+define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %src0, i32 %src1)
call void asm sideeffect "; use $0", "s"(i32 %readlane)
ret void
}
-; CHECK-LABEL: {{^}}test_readlane_vreg_sreg:
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_readlane_vreg_sreg(i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %src0, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(i64 %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call double @llvm.amdgcn.readlane.f64(double %src0, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(double %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dword s0, s[4:5], 0x4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
%vgpr = call i32 asm sideeffect "; def $0", "=v"()
- %readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 %src1)
+ %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %vgpr, i32 %src1)
call void asm sideeffect "; use $0", "s"(i32 %readlane)
ret void
}
-; CHECK-LABEL: {{^}}test_readlane_imm_sreg:
-; CHECK-NOT: v_readlane_b32
-define amdgpu_kernel void @test_readlane_imm_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
- %readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)
+define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s1
+; CHECK-GISEL-NEXT: v_readlane_b32 s1, v1, s1
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %vgpr = call i64 asm sideeffect "; def $0", "=v"()
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %vgpr, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(i64 %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s1
+; CHECK-GISEL-NEXT: v_readlane_b32 s1, v1, s1
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %vgpr = call double asm sideeffect "; def $0", "=v"()
+ %readlane = call double @llvm.amdgcn.readlane.f64(double %vgpr, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(double %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 32, i32 %src1)
store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readlane_vregs:
-; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
-define amdgpu_kernel void @test_readlane_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 32, i32 %src1)
+ store i64 %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call double @llvm.amdgcn.readlane.f64(double 32.0, i32 %src1)
+ store double %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vregs_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: flat_store_dword v[2:3], v0
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vregs_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
%args = load <2 x i32>, ptr addrspace(1) %gep.in
%value = extractelement <2 x i32> %args, i32 0
%lane = extractelement <2 x i32> %args, i32 1
- %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane)
+ %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %value, i32 %lane)
store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
-; TODO: m0 should be folded.
-; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
-; CHECK: s_mov_b32 m0, -1
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
+define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vregs_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vregs_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3
+; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
+ %args = load <2 x i64>, ptr addrspace(1) %gep.in
+ %value = extractelement <2 x i64> %args, i32 0
+ %lane = extractelement <2 x i64> %args, i32 1
+ %lane32 = trunc i64 %lane to i32
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %value, i32 %lane32)
+ store i64 %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vregs_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vregs_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3
+; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid
+ %args = load <2 x double>, ptr addrspace(1) %gep.in
+ %value = extractelement <2 x double> %args, i32 0
+ %lane = extractelement <2 x double> %args, i32 1
+ %lane_cast = bitcast double %lane to i64
+ %lane32 = trunc i64 %lane_cast to i32
+ %readlane = call double @llvm.amdgcn.readlane.f64(double %value, i32 %lane32)
+ store double %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
%readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1)
store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readlane_vgpr_imm:
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
-define amdgpu_kernel void @test_readlane_vgpr_imm(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%vgpr = call i32 asm sideeffect "; def $0", "=v"()
- %readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 32) #0
+ %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %vgpr, i32 32) #0
store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]]
-; CHECK: ;;#ASMEND
-; CHECK-NOT: [[SGPR]]
-; CHECK-NOT: readlane
-; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
-define amdgpu_kernel void @test_readlane_copy_from_sgpr(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
+; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %vgpr = call i64 asm sideeffect "; def $0", "=v"()
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %vgpr, i32 32) #0
+ store i64 %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
+; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %vgpr = call double asm sideeffect "; def $0", "=v"()
+ %readlane = call double @llvm.amdgcn.readlane.f64(double %vgpr, i32 32) #0
+ store double %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
- %readfirstlane = call i32 @llvm.amdgcn.readlane(i32 %sgpr, i32 7)
+ %readfirstlane = call i32 @llvm.amdgcn.readlane.i32(i32 %sgpr, i32 7)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
+define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %sgpr = call i64 asm "s_mov_b64 $0, 0", "=s"()
+ %readfirstlane = call i64 @llvm.amdgcn.readlane.i64(i64 %sgpr, i32 7)
+ store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %sgpr = call double asm "s_mov_b64 $0, 0", "=s"()
+ %readfirstlane = call double @llvm.amdgcn.readlane.f64(double %sgpr, i32 7)
+ store double %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_readlane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_half:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_half:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call half @llvm.amdgcn.readlane.f16(half %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(half %x)
+ ret void
+}
+
+define void @test_readlane_float(ptr addrspace(1) %out, float %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_float:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_float:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call float @llvm.amdgcn.readlane.f32(float %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(float %x)
+ ret void
+}
+
+define void @test_readlane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_bfloat:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_bfloat:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(bfloat %x)
+ ret void
+}
+
+define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 0xffff
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(i16 %x)
+ ret void
+}
+
+define void @test_readlane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v2f16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v2f16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x half> @llvm.amdgcn.readlane.v2f16(<2 x half> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<2 x half> %x)
+ ret void
+}
+
+define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v2f32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v2f32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s5
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s5
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:5]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x float> @llvm.amdgcn.readlane.v2f32(<2 x float> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<2 x float> %x)
+ ret void
+}
+
+define void @test_readlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v7i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:10]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v7i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v9
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s10
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:10]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <7 x i32> @llvm.amdgcn.readlane.v7i32(<7 x i32> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<7 x i32> %x)
+ ret void
+}
+
+define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v8i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v8i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v6
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s7
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s7
+; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s7
+; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s7
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
new file mode 100644
index 0000000000000..49f8ef391c230
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
+
+define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(ptr %x)
+ ret void
+}
+
+define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
+ ret void
+}
+
+define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p3:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(3) @llvm.amdgcn.readlane.p3(ptr addrspace(3) %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x)
+ ret void
+}
+
+define void @test_readlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p3:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readlane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x)
+ ret void
+}
+
+define void @test_readlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p5:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(5) @llvm.amdgcn.readlane.p5(ptr addrspace(5) %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %x)
+ ret void
+}
+
+define void @test_readlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p5:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readlane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(5)> %x)
+ ret void
+}
+
+define void @test_readlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p6:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(6) @llvm.amdgcn.readlane.p6(ptr addrspace(6) %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(6) %x)
+ ret void
+}
+
+define void @test_readlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p6:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readlane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(6)> %x)
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 37951669dbe75..31f1085dd76ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -1,85 +1,2767 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CIGFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CIGFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX802-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64) #0
+declare double @llvm.amdgcn.writelane.f64(double, i32, double) #0
-; CHECK-LABEL: {{^}}test_writelane_sreg:
-; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_sreg(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
+; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
+; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
%oldval = load i32, ptr addrspace(1) %out
- %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_imm_sreg:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_imm_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load i64, ptr addrspace(1) %out
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load double, ptr addrspace(1) %out
+ %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, 32, s2
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s2
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s0
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, 32, s2
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s2
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s0
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
%oldval = load i32, ptr addrspace(1) %out
- %writelane = call i32 @llvm.amdgcn.writelane(i32 32, i32 %src1, i32 %oldval)
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 32, i32 %src1, i32 %oldval)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_vreg_lane:
-; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
-; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
-define amdgpu_kernel void @test_writelane_vreg_lane(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, 0, s4
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, 32, s4
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s4
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, 32, s4
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, 0, s4
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s4
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s4
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load i64, ptr addrspace(1) %out
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 32, i32 %src1, i64 %oldval)
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8
+; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s4
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT: s_mov_b32 s2, 0x40400000
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT: s_mov_b32 s0, 0x40400000
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s4
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8
+; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, 0, s4
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-GISEL-NEXT: s_mov_b32 s2, 0x40400000
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 0, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT: s_mov_b32 s0, 0x40400000
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 0, s4
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load double, ptr addrspace(1) %out
+ %writelane = call double @llvm.amdgcn.writelane.f64(double 32.0, i32 %src1, double %oldval)
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dword v0, v[0:1]
+; GFX802-SDAG-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: s_nop 2
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, 12, s2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:4
+; GFX1010-SDAG-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-SDAG-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1010-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:4
+; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1100-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dword v0, v[0:1]
+; GFX802-GISEL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: s_nop 2
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, 12, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dword v0, v0, s[2:3] offset:4
+; GFX1010-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-GISEL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1010-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] offset:4
+; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1100-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
%args = load <2 x i32>, ptr addrspace(1) %gep.in
%oldval = load i32, ptr addrspace(1) %out
%lane = extractelement <2 x i32> %args, i32 1
- %writelane = call i32 @llvm.amdgcn.writelane(i32 12, i32 %lane, i32 %oldval)
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 12, i32 %lane, i32 %oldval)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_m0_sreg:
-; CHECK: s_mov_b32 m0, -1
-; CIGFX9: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
-; CIGFX9: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0
-; GFX10: v_writelane_b32 v{{[0-9]+}}, m0, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1]
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_nop 2
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, 0, s2
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, 12, s2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:8
+; GFX1010-SDAG-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s3
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 12, s3
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s3
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 12, s3
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX802-GISEL-NEXT: s_nop 3
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, 12, s2
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, 0, s2
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8
+; GFX1010-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1010-GISEL-NEXT: v_writelane_b32 v2, 0, s2
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1100-GISEL-NEXT: v_writelane_b32 v2, 0, s2
+; GFX1100-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
+ %args = load <2 x i64>, ptr addrspace(1) %gep.in
+ %oldval = load i64, ptr addrspace(1) %out
+ %lane = extractelement <2 x i64> %args, i32 1
+ %lane32 = trunc i64 %lane to i32
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 12, i32 %lane32, i64 %oldval)
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GFX802-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1]
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v2
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s4, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s2
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:8
+; GFX1010-SDAG-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT: s_mov_b32 s2, 0x40280000
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s3
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s3
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1100-SDAG-NEXT: s_mov_b32 s2, 0x40280000
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s2, s3
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s3
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX802-GISEL-NEXT: s_nop 2
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, 0, s2
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8
+; GFX1010-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX1010-GISEL-NEXT: s_mov_b32 s3, 0x40280000
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s2
+; GFX1010-GISEL-NEXT: v_writelane_b32 v2, s3, s2
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX1100-GISEL-NEXT: s_mov_b32 s3, 0x40280000
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s2
+; GFX1100-GISEL-NEXT: v_writelane_b32 v2, s3, s2
+; GFX1100-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid
+ %args = load <2 x double>, ptr addrspace(1) %gep.in
+ %oldval = load double, ptr addrspace(1) %out
+ %lane = extractelement <2 x double> %args, i32 1
+ %lane_cast = bitcast double %lane to i64
+ %lane32 = trunc i64 %lane_cast to i32
+ %writelane = call double @llvm.amdgcn.writelane.f64(double 12.0, i32 %lane32, double %oldval)
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-SDAG-NEXT: ;;#ASMSTART
+; GFX802-SDAG-NEXT: s_mov_b32 m0, -1
+; GFX802-SDAG-NEXT: ;;#ASMEND
+; GFX802-SDAG-NEXT: s_mov_b32 s4, m0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: ;;#ASMSTART
+; GFX1010-SDAG-NEXT: s_mov_b32 m0, -1
+; GFX1010-SDAG-NEXT: ;;#ASMEND
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, m0, s2
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: ;;#ASMSTART
+; GFX1100-SDAG-NEXT: s_mov_b32 m0, -1
+; GFX1100-SDAG-NEXT: ;;#ASMEND
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, m0, s0
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-GISEL-NEXT: ;;#ASMSTART
+; GFX802-GISEL-NEXT: s_mov_b32 m0, -1
+; GFX802-GISEL-NEXT: ;;#ASMEND
+; GFX802-GISEL-NEXT: s_mov_b32 s4, m0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-GISEL-NEXT: ;;#ASMSTART
+; GFX1010-GISEL-NEXT: s_mov_b32 m0, -1
+; GFX1010-GISEL-NEXT: ;;#ASMEND
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, m0, s2
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-GISEL-NEXT: ;;#ASMSTART
+; GFX1100-GISEL-NEXT: s_mov_b32 m0, -1
+; GFX1100-GISEL-NEXT: ;;#ASMEND
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, m0, s0
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
%oldval = load i32, ptr addrspace(1) %out
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
- %writelane = call i32 @llvm.amdgcn.writelane(i32 %m0, i32 %src1, i32 %oldval)
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %m0, i32 %src1, i32 %oldval)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_imm:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
-define amdgpu_kernel void @test_writelane_imm(ptr addrspace(1) %out, i32 %src0) #1 {
+define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %src0) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, 32
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, 32
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, 32
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, 32
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
%oldval = load i32, ptr addrspace(1) %out
- %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 32, i32 %oldval) #0
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 32, i32 %oldval) #0
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_sreg_oldval:
-; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
-; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
- %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %src0) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load i64, ptr addrspace(1) %out
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 32, i64 %oldval) #0
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double %src0) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load double, ptr addrspace(1) %out
+ %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 32, double %oldval) #0
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s6
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x8
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_imm_oldval:
-; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
-; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_imm_oldval(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
- %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42)
+define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x2
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x2
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x2
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x2
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x2
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x2
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x2
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x2
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 42)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
+define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s0
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s0
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s0
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 42)
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Kernel test: calls llvm.amdgcn.writelane.f64 with %src0 and %src1 as the
+; first two operands and the constant 42.0 as the third operand, then stores
+; the double result to %out. In the expected output the 64-bit operation
+; appears as two v_writelane_b32 instructions, one per 32-bit half, whose
+; destination registers are pre-loaded with 0 and 0x40450000.
+define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s0
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s0
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s0
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double 42.0)
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel test: loads a half from %out, passes it as the third operand of
+; llvm.amdgcn.writelane.f16 together with %src and %src1, and stores the
+; result back to %out. The expected output is a 16-bit load, a single
+; v_writelane_b32 on the loaded register, and a 16-bit store.
+define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_half:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_half:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_half:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_half:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_half:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_half:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load half, ptr addrspace(1) %out
+ %writelane = call half @llvm.amdgcn.writelane.f16(half %src, i32 %src1, half %oldval)
+ store half %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel test: loads a float from %out, passes it as the third operand of
+; llvm.amdgcn.writelane.f32 together with %src and %src1, and stores the
+; result back to %out. The expected output is a 32-bit load, a single
+; v_writelane_b32 on the loaded register, and a 32-bit store.
+define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_float:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_float:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_float:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_float:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_float:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_float:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load float, ptr addrspace(1) %out
+ %writelane = call float @llvm.amdgcn.writelane.f32(float %src, i32 %src1, float %oldval)
+ store float %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel test: loads a bfloat from %out, passes it as the third operand of
+; llvm.amdgcn.writelane.bf16 together with %src and %src1, and stores the
+; result back to %out. The expected output is a 16-bit load, a single
+; v_writelane_b32 on the loaded register, and a 16-bit store.
+define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_bfloat:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_bfloat:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_bfloat:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_bfloat:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_bfloat:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_bfloat:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load bfloat, ptr addrspace(1) %out
+ %writelane = call bfloat @llvm.amdgcn.writelane.bf16(bfloat %src, i32 %src1, bfloat %oldval)
+ store bfloat %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel test: loads an i16 from %out, passes it as the third operand of
+; llvm.amdgcn.writelane.i16 together with %src and %src1, and stores the
+; result back to %out. The expected output is a 16-bit load, a single
+; v_writelane_b32 on the loaded register, and a 16-bit store.
+define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_i16:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_i16:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_i16:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_i16:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_i16:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_i16:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load i16, ptr addrspace(1) %out
+ %writelane = call i16 @llvm.amdgcn.writelane.i16(i16 %src, i32 %src1, i16 %oldval)
+ store i16 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel test: loads a <2 x half> from %out, passes it as the third operand
+; of llvm.amdgcn.writelane.v2f16 together with %src and %src1, and stores the
+; result back to %out. The expected output treats the vector as one 32-bit
+; value: a dword load, a single v_writelane_b32, and a dword store.
+define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v2f16:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v2f16:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v2f16:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v2f16:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v2f16:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v2f16:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <2 x half>, ptr addrspace(1) %out
+ %writelane = call <2 x half> @llvm.amdgcn.writelane.v2f16(<2 x half> %src, i32 %src1, <2 x half> %oldval)
+ store <2 x half> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel test: loads a <2 x float> from %out, passes it as the third
+; operand of llvm.amdgcn.writelane.v2f32 together with %src and %src1, and
+; stores the result back to %out. The expected output uses two
+; v_writelane_b32 instructions, one per 32-bit element.
+; NOTE(review): despite the "readlane" in its name, this function exercises
+; the writelane intrinsic; the name looks like a copy-paste slip, so consider
+; renaming it to test_writelane_v2f32 and regenerating the assertions.
+define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_readlane_v2f32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_readlane_v2f32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v5, s6, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_readlane_v2f32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b64 v[5:6], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1
+; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_readlane_v2f32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v5, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v6, s6, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_readlane_v2f32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v5, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v6, s6, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_readlane_v2f32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b64 v[5:6], v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v5, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v6, s2, s1
+; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[5:6], off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <2 x float>, ptr addrspace(1) %out
+ %writelane = call <2 x float> @llvm.amdgcn.writelane.v2f32(<2 x float> %src, i32 %src1, <2 x float> %oldval)
+ store <2 x float> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v7i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_add_u32_e32 v17, vcc, 16, v0
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[10:13], v[0:1]
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dwordx3 v[14:16], v[17:18]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v9
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v7
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v6
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v13, s7, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v12, s8, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v11, s9, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v10, s10, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v15, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v14, s6, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
+; GFX802-SDAG-NEXT: flat_store_dwordx3 v[17:18], v[14:16]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v7i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: global_load_dwordx3 v[14:16], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v9
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v7
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v6
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s4, s5
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s8, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s9, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s10, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s11, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s7, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
+; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[14:16], off offset:16
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v7i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_load_b96 v[14:16], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v7
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v6
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s0, s1
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s4, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s5, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s6, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s7, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s2, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s3, s1
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off
+; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[14:16], off offset:16
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v7i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_add_u32_e32 v18, vcc, 16, v0
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[10:13], v[0:1]
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[14:17], v[18:19]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v9
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX802-GISEL-NEXT: v_writelane_b32 v10, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v11, s6, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v12, s7, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v13, s8, m0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v14, s9, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v15, s10, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v16, s11, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
+; GFX802-GISEL-NEXT: flat_store_dwordx3 v[18:19], v[14:16]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v7i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:16
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v9
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s6, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s7, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s8, s5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s9, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v15, s10, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v16, s11, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
+; GFX1010-GISEL-NEXT: global_store_dwordx3 v[0:1], v[14:16], off offset:16
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v7i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
+; GFX1100-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:16
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s2, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s3, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s4, s1
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s5, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v15, s6, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v16, s7, s1
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off
+; GFX1100-GISEL-NEXT: global_store_b96 v[0:1], v[14:16], off offset:16
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <7 x i32>, ptr addrspace(1) %out
+ %writelane = call <7 x i32> @llvm.amdgcn.writelane.v7i32(<7 x i32> %src, i32 %src1, <7 x i32> %oldval)
+ store <7 x i32> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v8i16:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v6
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v10, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v9, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v8, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s7, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v8i16:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v9, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s7, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s8, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v8i16:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b128 v[7:10], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v9, s2, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s3, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s4, s1
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[7:10], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v8i16:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v7, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v8, s6, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v9, s7, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v10, s8, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v8i16:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v7, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v8, s6, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v9, s7, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s8, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v8i16:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b128 v[7:10], v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v6
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v7, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v8, s2, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v9, s3, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s4, s1
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[7:10], off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <8 x i16>, ptr addrspace(1) %out
+ %writelane = call <8 x i16> @llvm.amdgcn.writelane.v8i16(<8 x i16> %src, i32 %src1, <8 x i16> %oldval)
+ store <8 x i16> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
new file mode 100644
index 0000000000000..edc1afe410a63
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
@@ -0,0 +1,425 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
+
+define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p0:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p0:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v5, s6, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p0:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b64 v[5:6], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1
+; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load ptr, ptr addrspace(1) %out
+ %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval)
+ store ptr %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p0:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_add_u32_e32 v13, vcc, 16, v0
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[9:12], v[0:1]
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dwordx2 v[15:16], v[13:14]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v8
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v2
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v7
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v12, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v11, s7, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v10, s8, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v9, s9, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v15, s5, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[13:14], v[15:16]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p0:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: global_load_dwordx2 v[13:14], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[9:12], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v8
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v7
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v6
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s9, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v9, s10, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[9:12], off
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[13:14], off offset:16
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p0:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_load_b64 v[13:14], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT: global_load_b128 v[9:12], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v8
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v7
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v6
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s5, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v9, s6, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[9:12], off
+; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[13:14], off offset:16
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <3 x ptr>, ptr addrspace(1) %out
+ %writelane = call <3 x ptr> @llvm.amdgcn.writelane.v3p0(<3 x ptr> %src, i32 %src1, <3 x ptr> %oldval)
+ store <3 x ptr> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p3:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p3:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p3:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load ptr addrspace(3), ptr addrspace(1) %out
+ %writelane = call ptr addrspace(3) @llvm.amdgcn.writelane.p3(ptr addrspace(3) %src, i32 %src1, ptr addrspace(3) %oldval)
+ store ptr addrspace(3) %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p3:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p3:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s7, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p3:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b96 v[6:8], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1
+; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <3 x ptr addrspace(3)>, ptr addrspace(1) %out
+ %writelane = call <3 x ptr addrspace(3)> @llvm.amdgcn.writelane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1, <3 x ptr addrspace(3)> %oldval)
+ store <3 x ptr addrspace(3)> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p5:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p5:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p5:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load ptr addrspace(5), ptr addrspace(1) %out
+ %writelane = call ptr addrspace(5) @llvm.amdgcn.writelane.p5(ptr addrspace(5) %src, i32 %src1, ptr addrspace(5) %oldval)
+ store ptr addrspace(5) %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p5:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p5:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s7, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p5:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b96 v[6:8], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1
+; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <3 x ptr addrspace(5)>, ptr addrspace(1) %out
+ %writelane = call <3 x ptr addrspace(5)> @llvm.amdgcn.writelane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1, <3 x ptr addrspace(5)> %oldval)
+ store <3 x ptr addrspace(5)> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p6:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p6:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p6:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load ptr addrspace(6), ptr addrspace(1) %out
+ %writelane = call ptr addrspace(6) @llvm.amdgcn.writelane.p6(ptr addrspace(6) %src, i32 %src1, ptr addrspace(6) %oldval)
+ store ptr addrspace(6) %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p6:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p6:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s7, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p6:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b96 v[6:8], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1
+; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <3 x ptr addrspace(6)>, ptr addrspace(1) %out
+ %writelane = call <3 x ptr addrspace(6)> @llvm.amdgcn.writelane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1, <3 x ptr addrspace(6)> %oldval)
+ store <3 x ptr addrspace(6)> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 94c32e3cbe99f..483ea8ad57d1b 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2714,7 +2714,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: store volatile i32 [[VAR]], ptr undef, align 4
; CHECK-NEXT: store volatile i32 0, ptr undef, align 4
; CHECK-NEXT: store volatile i32 123, ptr undef, align 4
@@ -2737,7 +2737,7 @@ define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
define i32 @readfirstlane_idempotent(i32 %arg) {
; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2748,7 +2748,7 @@ define i32 @readfirstlane_idempotent(i32 %arg) {
define i32 @readfirstlane_readlane(i32 %arg) {
; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2759,10 +2759,10 @@ define i32 @readfirstlane_readlane(i32 %arg) {
define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
@@ -2777,10 +2777,10 @@ bb1:
define i32 @readfirstlane_readlane_different_block(i32 %arg) {
; CHECK-LABEL: @readfirstlane_readlane_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 0)
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
@@ -2800,7 +2800,7 @@ declare i32 @llvm.amdgcn.readlane(i32, i32)
define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
+; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 7)
; CHECK-NEXT: store volatile i32 [[VAR]], ptr undef, align 4
; CHECK-NEXT: store volatile i32 0, ptr undef, align 4
; CHECK-NEXT: store volatile i32 123, ptr undef, align 4
@@ -2823,7 +2823,7 @@ define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2833,8 +2833,8 @@ define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1:%.*]])
; CHECK-NEXT: ret i32 [[READ1]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
@@ -2844,7 +2844,7 @@ define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1
define i32 @readlane_readfirstlane(i32 %arg) {
; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2855,10 +2855,10 @@ define i32 @readlane_readfirstlane(i32 %arg) {
define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) {
; CHECK-LABEL: @readlane_idempotent_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE]])
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
@@ -2874,10 +2874,10 @@ bb1:
define i32 @readlane_readfirstlane_different_block(i32 %arg) {
; CHECK-LABEL: @readlane_readfirstlane_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0)
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 0)
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
More information about the cfe-commits mailing list