[clang] [llvm] [AMDGPU] Enable atomic optimizer for 64 bit divergent values (PR #96473)
Vikram Hegde via cfe-commits
cfe-commits at lists.llvm.org
Wed Jun 26 01:07:58 PDT 2024
https://github.com/vikramRH updated https://github.com/llvm/llvm-project/pull/96473
>From b00271123e45c8e1fb66e8ec777eb6bb1ee6b311 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 10 Apr 2024 11:53:16 +0000
Subject: [PATCH 01/11] [AMDGPU] Extend readlane, writelane and readfirstlane
intrinsic lowering for generic types
---
clang/lib/CodeGen/CGBuiltin.cpp | 4 +
clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 4 +-
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 +-
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 10 +-
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 4 +
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 29 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 190 +
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 3 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 60 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +-
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 11 +-
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 16 +-
.../UniformityAnalysis/AMDGPU/intrinsics.ll | 6 +-
.../atomic_optimizations_mul_one.ll | 13 +-
.../atomic_optimization_split_dt_update.ll | 2 +-
.../test/CodeGen/AMDGPU/global-atomic-scan.ll | 48 +-
.../AMDGPU/global_atomic_optimizer_fp_rtn.ll | 120 +-
.../AMDGPU/global_atomics_iterative_scan.ll | 2 +-
.../global_atomics_iterative_scan_fp.ll | 8 +-
.../global_atomics_optimizer_fp_no_rtn.ll | 24 +-
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 654 +++-
.../AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll | 92 +
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 960 +++++-
.../AMDGPU/llvm.amdgcn.readlane.ptr.ll | 105 +
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 3041 ++++++++++++++++-
.../AMDGPU/llvm.amdgcn.writelane.ptr.ll | 292 ++
.../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 32 +-
28 files changed, 5485 insertions(+), 265 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index f9ee93049b12d..a275ada586d13 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18479,6 +18479,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
return Builder.CreateCall(F, Args);
}
+ case AMDGPU::BI__builtin_amdgcn_readlane:
+ return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
+ case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+ return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane);
case AMDGPU::BI__builtin_amdgcn_div_fixup:
case AMDGPU::BI__builtin_amdgcn_div_fixupf:
case AMDGPU::BI__builtin_amdgcn_div_fixuph:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index c2ef9ea947e93..83b75f9e2d618 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -306,14 +306,14 @@ void test_ds_bpermute(global int* out, int a, int b)
}
// CHECK-LABEL: @test_readfirstlane
-// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
void test_readfirstlane(global int* out, int a)
{
*out = __builtin_amdgcn_readfirstlane(a);
}
// CHECK-LABEL: @test_readlane
-// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
void test_readlane(global int* out, int a, int b)
{
*out = __builtin_amdgcn_readlane(a, b);
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index be8048ca2459c..457566944069e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2176,26 +2176,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
def int_amdgcn_readfirstlane :
- ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
- ClangBuiltin<"__builtin_amdgcn_readlane">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
- ClangBuiltin<"__builtin_amdgcn_writelane">,
- Intrinsic<[llvm_i32_ty], [
- llvm_i32_ty, // uniform value to write: returned by the selected lane
- llvm_i32_ty, // uniform lane select
- llvm_i32_ty // returned by all lanes other than the selected one
+ Intrinsic<[llvm_any_ty], [
+ LLVMMatchType<0>, // uniform value to write: returned by the selected lane
+ llvm_i32_ty, // uniform lane select
+ LLVMMatchType<0> // returned by all lanes other than the selected one
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 1d645002b1fe6..9ac0a52f63fd8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -433,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
V = B.CreateBitCast(V, IntNTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -523,10 +523,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
- Function *WriteLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+ Function *ReadLane = Intrinsic::getDeclaration(
+ M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+ Function *WriteLane = Intrinsic::getDeclaration(
+ M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d35a022ad6806..9cd7496ba9f96 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5496,6 +5496,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LDS)
NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
+ NODE_NAME_CASE(READLANE)
+ NODE_NAME_CASE(READFIRSTLANE)
+ NODE_NAME_CASE(WRITELANE)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(LOAD_D16_HI)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 3814b56a4d56a..02c3dcf39e554 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -558,6 +558,10 @@ enum NodeType : unsigned {
FPTRUNC_ROUND_UPWARD,
FPTRUNC_ROUND_DOWNWARD,
+ READLANE,
+ READFIRSTLANE,
+ WRITELANE,
+
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
LOAD_D16_HI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 702f6e67c5527..e4f329b200c86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -342,6 +342,22 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+def AMDGPUReadfirstlaneOp : SDTypeProfile<1, 1, [ // one result, one operand
+ SDTCisSameAs<0, 1> // result type equals the source type
+]>;
+
+def AMDGPUReadlaneOp : SDTypeProfile<1, 2, [ // one result, two operands
+ SDTCisSameAs<0, 1>, SDTCisInt<2> // result matches the source; second operand is an integer
+]>;
+
+def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [ // one result, three operands
+ SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3> // result matches operands 1 and 3; operand 2 is an integer
+]>;
+// SelectionDAG nodes for the AMDGPUISD lane opcodes, each using the type profile defined above.
+def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>;
+def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>;
+def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp>;
+
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
@@ -506,3 +522,16 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc
def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
(AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUreadlane : PatFrags<(ops node:$src0, node:$src1), // matches the intrinsic or the READLANE node
+ [(int_amdgcn_readlane node:$src0, node:$src1),
+ (AMDGPUreadlane_impl node:$src0, node:$src1)]>;
+// Matches either the readfirstlane intrinsic or the READFIRSTLANE node.
+def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
+ [(int_amdgcn_readfirstlane node:$src),
+ (AMDGPUreadfirstlane_impl node:$src)]>;
+// Matches either the writelane intrinsic or the WRITELANE node.
+def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
+ (AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bd7bf78c4c0bd..6ffc8a20f76fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5387,6 +5387,192 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
return true;
}
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+ MachineInstr &MI,
+ Intrinsic::ID IID) const {
+ // Rewrites a readlane/readfirstlane/writelane whose value is not 32 bits wide in terms of S32 lane ops.
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(2).getReg();
+ // Emits one S32 lane op of kind IID; Src1/Src2 are only added for the intrinsics that take them.
+ auto createLaneOp = [&](Register Src0, Register Src1,
+ Register Src2) -> Register {
+ auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0);
+ switch (IID) {
+ case Intrinsic::amdgcn_readfirstlane:
+ return LaneOp.getReg(0);
+ case Intrinsic::amdgcn_readlane:
+ return LaneOp.addUse(Src1).getReg(0);
+ case Intrinsic::amdgcn_writelane:
+ return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+ default:
+ llvm_unreachable("unhandled lane op");
+ }
+ };
+ // Src1 (operand 3) exists for readlane/writelane, Src2 (operand 4) only for writelane.
+ Register Src1, Src2;
+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+ Src1 = MI.getOperand(3).getReg();
+ if (IID == Intrinsic::amdgcn_writelane) {
+ Src2 = MI.getOperand(4).getReg();
+ }
+ }
+
+ LLT Ty = MRI.getType(DstReg);
+ unsigned Size = Ty.getSizeInBits();
+
+ if (Size == 32) {
+ // 32-bit values are already legal; MI is left untouched.
+ return true;
+ }
+ // Narrower than 32 bits: any-extend the value operands to S32, run the op, truncate the result.
+ if (Size < 32) {
+ Register Src0Cast = MRI.getType(Src0).isScalar()
+ ? Src0
+ : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
+ Src0 = B.buildAnyExt(S32, Src0Cast).getReg(0);
+ if (Src2.isValid()) {
+ Register Src2Cast =
+ MRI.getType(Src2).isScalar()
+ ? Src2
+ : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
+ Src2 = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
+ }
+
+ Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
+ if (Ty.isScalar())
+ B.buildTrunc(DstReg, LaneOpDst);
+ else {
+ auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDst);
+ B.buildBitcast(DstReg, Trunc);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
+ // Multiple of 32 bits: split into Size / 32 pieces, emit one lane op per piece, then re-merge.
+ if ((Size % 32) == 0) {
+ SmallVector<Register, 2> PartialRes;
+ unsigned NumParts = Size / 32;
+ auto IsS16Vec = Ty.isVector() && Ty.getElementType() == S16;
+ MachineInstrBuilder Src0Parts;
+ // Pointers go through ptrtoint before unmerging; s16 vectors are unmerged into V2S16 pieces.
+ if (Ty.isPointer()) {
+ auto PtrToInt = B.buildPtrToInt(LLT::scalar(Size), Src0);
+ Src0Parts = B.buildUnmerge(S32, PtrToInt);
+ } else if (Ty.isPointerVector()) {
+ LLT IntVecTy = Ty.changeElementType(
+ LLT::scalar(Ty.getElementType().getSizeInBits()));
+ auto PtrToInt = B.buildPtrToInt(IntVecTy, Src0);
+ Src0Parts = B.buildUnmerge(S32, PtrToInt);
+ } else
+ Src0Parts =
+ IsS16Vec ? B.buildUnmerge(V2S16, Src0) : B.buildUnmerge(S32, Src0);
+
+ switch (IID) {
+ case Intrinsic::amdgcn_readlane: {
+ Register Src1 = MI.getOperand(3).getReg();
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+ : Src0Parts.getReg(i);
+ PartialRes.push_back(
+ (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
+ .addUse(Src0)
+ .addUse(Src1))
+ .getReg(0));
+ }
+ break;
+ }
+ case Intrinsic::amdgcn_readfirstlane: {
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+ : Src0Parts.getReg(i);
+ PartialRes.push_back(
+ (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {S32})
+ .addUse(Src0)
+ .getReg(0)));
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_writelane: {
+ Register Src1 = MI.getOperand(3).getReg();
+ Register Src2 = MI.getOperand(4).getReg();
+ MachineInstrBuilder Src2Parts;
+ // Split Src2 exactly like Src0 (full Size, not a fixed 64 bits) so it also yields NumParts pieces.
+ if (Ty.isPointer()) {
+ auto PtrToInt = B.buildPtrToInt(LLT::scalar(Size), Src2);
+ Src2Parts = B.buildUnmerge(S32, PtrToInt);
+ } else if (Ty.isPointerVector()) {
+ LLT IntVecTy = Ty.changeElementType(
+ LLT::scalar(Ty.getElementType().getSizeInBits()));
+ auto PtrToInt = B.buildPtrToInt(IntVecTy, Src2);
+ Src2Parts = B.buildUnmerge(S32, PtrToInt);
+ } else
+ Src2Parts =
+ IsS16Vec ? B.buildUnmerge(V2S16, Src2) : B.buildUnmerge(S32, Src2);
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+ : Src0Parts.getReg(i);
+ Src2 = IsS16Vec ? B.buildBitcast(S32, Src2Parts.getReg(i)).getReg(0)
+ : Src2Parts.getReg(i);
+ PartialRes.push_back(
+ (B.buildIntrinsic(Intrinsic::amdgcn_writelane, {S32})
+ .addUse(Src0)
+ .addUse(Src1)
+ .addUse(Src2))
+ .getReg(0));
+ }
+
+ break;
+ }
+ }
+ // Reassemble the S32 results into a value of the original type.
+ if (Ty.isPointerVector()) {
+ unsigned PtrSize = Ty.getElementType().getSizeInBits();
+ SmallVector<Register, 2> PtrElements;
+ if (PtrSize == 32) {
+ // 32-bit pointers: each S32 result converts directly to one element.
+ for (unsigned i = 0; i < NumParts; i++)
+ PtrElements.push_back(
+ B.buildIntToPtr(Ty.getElementType(), PartialRes[i]).getReg(0));
+ } else {
+ // Handle legalization of <? x [pointer type bigger than 32 bits]>
+ SmallVector<Register, 2> PtrParts;
+ unsigned NumS32Parts = PtrSize / 32;
+ unsigned PartIdx = 0;
+ for (unsigned i = 0, j = 1; i < NumParts; i += NumS32Parts, j++) {
+ // Merge S32 components of a pointer element first.
+ for (; PartIdx < (j * NumS32Parts); PartIdx++)
+ PtrParts.push_back(PartialRes[PartIdx]);
+
+ auto MergedPtr =
+ B.buildMergeLikeInstr(LLT::scalar(PtrSize), PtrParts);
+ PtrElements.push_back(
+ B.buildIntToPtr(Ty.getElementType(), MergedPtr).getReg(0));
+ PtrParts.clear();
+ }
+ }
+
+ B.buildMergeLikeInstr(DstReg, PtrElements);
+ } else {
+ if (IsS16Vec) {
+ for (unsigned i = 0; i < NumParts; i++)
+ PartialRes[i] = B.buildBitcast(V2S16, PartialRes[i]).getReg(0);
+ }
+ B.buildMergeLikeInstr(DstReg, PartialRes);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
+ // Sizes above 32 bits that are not a multiple of 32 are not handled here.
+ return false;
+}
+
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -7330,6 +7516,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
Observer.changedInstr(MI);
return true;
}
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_writelane:
+ case Intrinsic::amdgcn_readfirstlane:
+ return legalizeLaneOp(Helper, MI, IntrID);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index e5ba84a74a0f8..40e056154527f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -208,6 +208,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
Intrinsic::ID IID) const;
+ bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
+ Intrinsic::ID IID) const;
+
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0a3a56e9b3a0b..6c23bdf09974b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6086,6 +6086,62 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
}
+static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) { // Lowers a readlane/readfirstlane/writelane intrinsic node whose value type is not 32 bits wide.
+ EVT VT = N->getValueType(0);
+ unsigned ValSize = VT.getSizeInBits();
+ unsigned IntrinsicID = N->getConstantOperandVal(0); // operand 0 holds the intrinsic ID
+ SDValue Src0 = N->getOperand(1);
+ SDLoc SL(N);
+ MVT IntVT = MVT::getIntegerVT(ValSize); // integer type with the same bit width as VT
+ // Builds the AMDGPUISD node: WRITELANE if Src2 is set, else READLANE if Src1 is set, else READFIRSTLANE.
+ auto createLaneOp = [&DAG, &SL](SDValue Src0, SDValue Src1, SDValue Src2,
+ MVT VT) -> SDValue {
+ return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
+ : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
+ : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
+ };
+ // Src1 is taken for readlane/writelane and Src2 only for writelane; otherwise they stay empty.
+ SDValue Src1, Src2;
+ if (IntrinsicID == Intrinsic::amdgcn_readlane ||
+ IntrinsicID == Intrinsic::amdgcn_writelane) {
+ Src1 = N->getOperand(2);
+ if (IntrinsicID == Intrinsic::amdgcn_writelane)
+ Src2 = N->getOperand(3);
+ }
+
+ if (ValSize == 32) {
+ // 32-bit values need no rewriting; an empty SDValue is returned.
+ return SDValue();
+ }
+ // Narrower than 32 bits: bitcast to integer, extend to i32, do the op, then truncate and bitcast back.
+ if (ValSize < 32) {
+ SDValue InitBitCast = DAG.getBitcast(IntVT, Src0);
+ Src0 = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32);
+ if (Src2.getNode()) {
+ SDValue Src2Cast = DAG.getBitcast(IntVT, Src2);
+ Src2 = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32);
+ }
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
+ SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+ return DAG.getBitcast(VT, Trunc);
+ }
+ // Multiple of 32 bits: view the value as a vector of i32 and build a vector-typed lane op.
+ if ((ValSize % 32) == 0) {
+ MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+ Src0 = DAG.getBitcast(VecVT, Src0);
+
+ if (Src2.getNode())
+ Src2 = DAG.getBitcast(VecVT, Src2);
+
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
+ SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode()); // presumably scalarizes into per-element i32 ops
+ return DAG.getBitcast(VT, UnrolledLaneOp);
+ }
+ // Any other width (above 32 bits, not a multiple of 32) is left unhandled.
+ return SDValue();
+}
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -8553,6 +8609,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_addrspacecast_nonnull:
return lowerADDRSPACECAST(Op, DAG);
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_writelane:
+ return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9e811f54d05e..628fa438f696a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3400,7 +3400,7 @@ def : GCNPat<
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
- (int_amdgcn_readfirstlane (i32 imm:$src)),
+ (i32 (AMDGPUreadfirstlane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 4a56fad0cd603..9c5d6a7bf6d0b 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -112,7 +112,7 @@ class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies {
!if(P.HasOMod,
[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node P.Src0RC32:$src0))]
+ [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))]
)
);
}
@@ -243,11 +243,16 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
// FIXME: Specify SchedRW for READFIRSTLANE_B32
// TODO: There is VOP3 encoding also
def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
- getVOP1Pat<int_amdgcn_readfirstlane,
- VOP_READFIRSTLANE>.ret, 1> {
+ [], 1> {
let isConvergent = 1;
}
+foreach vt = Reg32Types.types in { // one selection pattern per type listed in Reg32Types.types
+ def : GCNPat<(vt (AMDGPUreadfirstlane (vt VRegOrLdsSrc_32:$src0))),
+ (V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0))
+ >;
+}
+
let isReMaterializable = 1 in {
let SchedRW = [WriteDoubleCvt] in {
// OMod clears exceptions when set in this instruction
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index d2af1753d5503..b1df57320cfdd 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -780,14 +780,22 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag,
// These are special and do not read the exec mask.
let isConvergent = 1, Uses = []<Register> in {
-def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
- [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
+def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,[]>;
let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
- [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []>;
} // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
+foreach vt = Reg32Types.types in { // one readlane and one writelane pattern per type in Reg32Types.types
+ def : GCNPat<(vt (AMDGPUreadlane vt:$src0, i32:$src1)),
+ (V_READLANE_B32 VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1)
+ >;
+// The lane-select operand ($src1) is always i32; the value operands use the iterated type.
+ def : GCNPat<(vt (AMDGPUwritelane vt:$src0, i32:$src1, vt:$src2)),
+ (V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2)
+ >;
+}
+
let isReMaterializable = 1 in {
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 26c85e83b53ad..74d2f53d7b317 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -56,9 +56,9 @@ define amdgpu_kernel void @mov_dpp8(ptr addrspace(1) %out, i32 %in) #0 {
ret void
}
-; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2)
+; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 {
- %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2)
+ %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
store i32 %tmp0, ptr addrspace(1) %out
ret void
}
@@ -237,7 +237,7 @@ declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1
-declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1
+declare i32 @llvm.amdgcn.writelane.i32(i32, i32, i32) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index 220dc70165e87..bdfafa89cd047 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
@@ -74,7 +73,7 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
; IR-NEXT: ret void
@@ -172,7 +171,7 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
; IR-NEXT: ret void
@@ -273,7 +272,7 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1
; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP16]], i32 0, i32 0, i32 0)
@@ -374,7 +373,7 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
; IR-NEXT: [[ARG_VEC:%.*]] = bitcast i128 [[ARG_INT]] to <4 x i32>
@@ -476,7 +475,7 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
; IR-NEXT: [[ARG_VEC:%.*]] = bitcast i128 [[ARG_INT]] to <4 x i32>
@@ -581,7 +580,7 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) {
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1
; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]]
; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
index c07cd4e493b9a..019f76aa44a87 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
@@ -48,7 +48,7 @@ define amdgpu_kernel void @ham(ptr addrspace(4) %arg) {
; CHECK-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP6]], [[BB7]] ], [ [[TMP16:%.*]], [[COMPUTELOOP]] ]
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
-; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[PHI]], i32 [[TMP11]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[PHI]], i32 [[TMP11]])
; CHECK-NEXT: [[TMP13]] = add i32 [[ACCUMULATOR]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = shl i64 1, [[TMP10]]
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], -1
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
index 6b47f81bccb71..6c61c837881c4 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
@@ -130,7 +130,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -193,7 +193,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -251,7 +251,7 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -310,7 +310,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -364,7 +364,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -421,7 +421,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -473,7 +473,7 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -526,7 +526,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -586,7 +586,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -649,7 +649,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -707,7 +707,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -766,7 +766,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -820,7 +820,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -878,7 +878,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -931,7 +931,7 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -985,7 +985,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1040,7 +1040,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1098,7 +1098,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1151,7 +1151,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1205,7 +1205,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1260,7 +1260,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1318,7 +1318,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1371,7 +1371,7 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1425,7 +1425,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
index b71728096093c..baaf50377338c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -29,7 +29,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_uns
; IR: 16:
; IR-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
+; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]])
; IR-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
; IR-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
@@ -62,7 +62,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP28:%.*]]
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -76,11 +76,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]])
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]])
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = fadd float [[ACCUMULATOR]], [[TMP24]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -120,7 +120,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
; IR-DPP-NEXT: [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63)
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63)
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]])
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -131,7 +131,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]])
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]])
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
; IR-DPP-NEXT: [[TMP39:%.*]] = fadd float [[TMP37]], [[TMP38]]
@@ -167,7 +167,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
@@ -199,7 +199,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -232,7 +232,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -246,11 +246,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -290,7 +290,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -301,7 +301,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -337,7 +337,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
@@ -369,7 +369,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -402,7 +402,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -416,11 +416,11 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -460,7 +460,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -471,7 +471,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -503,7 +503,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_uns
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
; IR-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
; IR-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
@@ -536,7 +536,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP28:%.*]])
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -550,11 +550,11 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]])
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]])
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP24]])
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -594,7 +594,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]])
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63)
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63)
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]])
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -605,7 +605,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]])
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]])
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.minnum.f32(float [[TMP37]], float [[TMP38]])
@@ -637,7 +637,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]]
@@ -665,7 +665,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
; IR-DPP: 12:
; IR-DPP-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]]
@@ -698,7 +698,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -712,11 +712,11 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -756,7 +756,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -767,7 +767,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP37]], float [[TMP38]], metadata !"fpexcept.strict") #[[ATTR8]]
@@ -803,7 +803,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
@@ -835,7 +835,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -868,7 +868,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[TMP18]]
@@ -882,11 +882,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -926,7 +926,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -937,7 +937,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
; IR-DPP: 33:
; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -1084,8 +1084,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_s
; IR-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]])
-; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]])
+; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]])
+; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]])
; IR-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1136,8 +1136,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1174,8 +1174,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1226,8 +1226,8 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s
; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1264,8 +1264,8 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s
; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1312,8 +1312,8 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_s
; IR-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
; IR-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]])
-; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]])
+; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]])
+; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]])
; IR-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
; IR-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
; IR-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
@@ -1360,8 +1360,8 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
@@ -1394,8 +1394,8 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
; IR-DPP-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-DPP-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
; IR-DPP-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
; IR-DPP-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
; IR-DPP-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
@@ -1446,8 +1446,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_
; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1484,8 +1484,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_
; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll
index f954560d0f5ca..4b4c99b3cd14c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll
@@ -83,7 +83,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, ptr addrspace(
; IR-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP6]], [[ENTRY]] ], [ [[TMP16:%.*]], [[COMPUTELOOP]] ]
; IR-NEXT: [[TMP10:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
-; IR-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[VALUE]], i32 [[TMP11]])
+; IR-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[VALUE]], i32 [[TMP11]])
; IR-NEXT: [[TMP13]] = add i32 [[ACCUMULATOR]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = shl i64 1, [[TMP10]]
; IR-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], -1
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
index 86e3d9338e078..38823681d1bb5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -69,7 +69,7 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #
; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]])
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]]
@@ -107,7 +107,7 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
-; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 63)
+; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63)
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float
; IR-DPP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
; IR-DPP-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0
@@ -191,7 +191,7 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]])
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]]
@@ -229,7 +229,7 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
-; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 63)
+; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63)
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float
; IR-DPP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
; IR-DPP-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
index b9234f47df192..83453354320fe 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -61,7 +61,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]])
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = fadd float [[ACCUMULATOR]], [[TMP18]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -100,7 +100,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63)
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63)
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]])
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -196,7 +196,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -235,7 +235,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -331,7 +331,7 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -370,7 +370,7 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -438,7 +438,7 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]])
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP18]])
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -477,7 +477,7 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false)
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]])
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63)
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63)
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]])
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -565,7 +565,7 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -604,7 +604,7 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -700,7 +700,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -739,7 +739,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 0284f44f5f14d..732489f22c36f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,65 +1,387 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+declare i64 @llvm.amdgcn.readfirstlane.i64(i64) #0
+declare double @llvm.amdgcn.readfirstlane.f64(double) #0
-; CHECK-LABEL: {{^}}test_readfirstlane:
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
-define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 {
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
+define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_imm:
-; CHECK: s_mov_b32 [[SGPR_VAL:s[0-9]]], 32
-; CHECK-NOT: [[SGPR_VAL]]
-; CHECK: ; use [[SGPR_VAL]]
-define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) #1 {
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s5
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %src)
+ store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s5
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double %src)
+ store double %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_mov_b32 s0, 32
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_mov_b32 s0, 32
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold:
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
-; CHECK-NOT: [[VVAL]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
-define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) #1 {
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_mov_b64 s[0:1], 32
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 32
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 32)
+ call void asm sideeffect "; use $0", "s"(i64 %readfirstlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_mov_b32 s0, 0
+; CHECK-SDAG-NEXT: s_mov_b32 s1, 0x40400000
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_mov_b32 s0, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double 32.0)
+ call void asm sideeffect "; use $0", "s"(double %readfirstlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_m0:
-; CHECK: s_mov_b32 m0, -1
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 32)
+ store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double 32.0)
+ store double %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_m0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_m0:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]]
-; CHECK: ;;#ASMEND
-; CHECK-NOT: [[SGPR]]
-; CHECK-NOT: readfirstlane
-; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
- %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
-; Make sure this doesn't crash.
-; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %sgpr = call i64 asm "s_mov_b64 $0, 0", "=s"()
+ %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %sgpr)
+ store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %sgpr = call double asm "s_mov_b64 $0, 0", "=s"()
+ %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double %sgpr)
+ store double %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_fi:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s9
+; CHECK-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-SDAG-NEXT: s_mov_b32 s4, 0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_fi:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s9
+; CHECK-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
%alloca = alloca i32, addrspace(5)
%int = ptrtoint ptr addrspace(5) %alloca to i32
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
@@ -67,5 +389,269 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
ret void
}
+define void @test_readfirstlane_half(ptr addrspace(1) %out, half %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_half:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_half:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call half @llvm.amdgcn.readfirstlane.f16(half %src)
+ call void asm sideeffect "; use $0", "s"(half %x)
+ ret void
+}
+
+define void @test_readfirstlane_float(ptr addrspace(1) %out, float %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_float:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_float:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call float @llvm.amdgcn.readfirstlane.f32(float %src)
+ call void asm sideeffect "; use $0", "s"(float %x)
+ ret void
+}
+
+define void @test_readfirstlane_bfloat(ptr addrspace(1) %out, bfloat %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_bfloat:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_bfloat:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call bfloat @llvm.amdgcn.readfirstlane.bf16(bfloat %src)
+ call void asm sideeffect "; use $0", "s"(bfloat %x)
+ ret void
+}
+
+define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0xffff
+; CHECK-SDAG-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use v0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src)
+ call void asm sideeffect "; use $0", "s"(i16 %x)
+ ret void
+}
+
+define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v2f16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v2f16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src)
+ call void asm sideeffect "; use $0", "s"(<2 x half> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v2f32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v2f32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:5]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x float> @llvm.amdgcn.readfirstlane.v2f32(<2 x float> %src)
+ call void asm sideeffect "; use $0", "s"(<2 x float> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v7i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:10]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v7i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:10]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <7 x i32> @llvm.amdgcn.readfirstlane.v7i32(<7 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<7 x i32> %x)
+ ret void
+}
+
+define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_p0:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:5]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src)
+ call void asm sideeffect "; use $0", "s"(ptr %x)
+ ret void
+}
+
+define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v3p0:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:9]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v8i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v8i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <8 x i16> @llvm.amdgcn.readfirstlane.v8i16(<8 x i16> %src)
+ call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
+ ret void
+}
+
attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
new file mode 100644
index 0000000000000..588f239606f52
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+
+define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p3:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) %src)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x)
+ ret void
+}
+
+define void @test_readfirstlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p3:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v3p3(<3 x ptr addrspace(3)> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x)
+ ret void
+}
+
+define void @test_readfirstlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p5:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(5) @llvm.amdgcn.readfirstlane.p5(ptr addrspace(5) %src)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %x)
+ ret void
+}
+
+define void @test_readfirstlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p5:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readfirstlane.v3p5(<3 x ptr addrspace(5)> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(5)> %x)
+ ret void
+}
+
+define void @test_readfirstlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p6:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(6) @llvm.amdgcn.readfirstlane.p6(ptr addrspace(6) %src)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(6) %x)
+ ret void
+}
+
+define void @test_readfirstlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p6:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readfirstlane.v3p6(<3 x ptr addrspace(6)> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(6)> %x)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 51465f6bd10ce..71cd3db81addd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,82 +1,966 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s
-declare i32 @llvm.amdgcn.readlane(i32, i32) #0
+declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0
+declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0
+declare double @llvm.amdgcn.readlane.f64(double, i32) #0
-; CHECK-LABEL: {{^}}test_readlane_sreg_sreg:
-; CHECK-NOT: v_readlane_b32
-define amdgpu_kernel void @test_readlane_sreg_sreg(i32 %src0, i32 %src1) #1 {
- %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
+define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %src0, i32 %src1)
call void asm sideeffect "; use $0", "s"(i32 %readlane)
ret void
}
-; CHECK-LABEL: {{^}}test_readlane_vreg_sreg:
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_readlane_vreg_sreg(i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %src0, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(i64 %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call double @llvm.amdgcn.readlane.f64(double %src0, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(double %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dword s0, s[4:5], 0x4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
%vgpr = call i32 asm sideeffect "; def $0", "=v"()
- %readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 %src1)
+ %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %vgpr, i32 %src1)
call void asm sideeffect "; use $0", "s"(i32 %readlane)
ret void
}
-; CHECK-LABEL: {{^}}test_readlane_imm_sreg:
-; CHECK-NOT: v_readlane_b32
-define amdgpu_kernel void @test_readlane_imm_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
- %readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)
+define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s1
+; CHECK-GISEL-NEXT: v_readlane_b32 s1, v1, s1
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %vgpr = call i64 asm sideeffect "; def $0", "=v"()
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %vgpr, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(i64 %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s1
+; CHECK-GISEL-NEXT: v_readlane_b32 s1, v1, s1
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_endpgm
+ %vgpr = call double asm sideeffect "; def $0", "=v"()
+ %readlane = call double @llvm.amdgcn.readlane.f64(double %vgpr, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(double %readlane)
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 32, i32 %src1)
store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readlane_vregs:
-; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
-define amdgpu_kernel void @test_readlane_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 32, i32 %src1)
+ store i64 %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %readlane = call double @llvm.amdgcn.readlane.f64(double 32.0, i32 %src1)
+ store double %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vregs_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: flat_store_dword v[2:3], v0
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vregs_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
%args = load <2 x i32>, ptr addrspace(1) %gep.in
%value = extractelement <2 x i32> %args, i32 0
%lane = extractelement <2 x i32> %args, i32 1
- %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane)
+ %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %value, i32 %lane)
store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
-; TODO: m0 should be folded.
-; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
-; CHECK: s_mov_b32 m0, -1
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
+define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vregs_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vregs_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3
+; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
+ %args = load <2 x i64>, ptr addrspace(1) %gep.in
+ %value = extractelement <2 x i64> %args, i32 0
+ %lane = extractelement <2 x i64> %args, i32 1
+ %lane32 = trunc i64 %lane to i32
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %value, i32 %lane32)
+ store i64 %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vregs_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vregs_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3
+; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid
+ %args = load <2 x double>, ptr addrspace(1) %gep.in
+ %value = extractelement <2 x double> %args, i32 0
+ %lane = extractelement <2 x double> %args, i32 1
+ %lane_cast = bitcast double %lane to i64
+ %lane32 = trunc i64 %lane_cast to i32
+ %readlane = call double @llvm.amdgcn.readlane.f64(double %value, i32 %lane32)
+ store double %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
%readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1)
store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readlane_vgpr_imm:
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
-define amdgpu_kernel void @test_readlane_vgpr_imm(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%vgpr = call i32 asm sideeffect "; def $0", "=v"()
- %readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 32) #0
+ %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %vgpr, i32 32) #0
store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]]
-; CHECK: ;;#ASMEND
-; CHECK-NOT: [[SGPR]]
-; CHECK-NOT: readlane
-; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
-define amdgpu_kernel void @test_readlane_copy_from_sgpr(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
+; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %vgpr = call i64 asm sideeffect "; def $0", "=v"()
+ %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %vgpr, i32 32) #0
+ store i64 %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v[0:1]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
+; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v[0:1]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %vgpr = call double asm sideeffect "; def $0", "=v"()
+ %readlane = call double @llvm.amdgcn.readlane.f64(double %vgpr, i32 32) #0
+ store double %readlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_endpgm
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
- %readfirstlane = call i32 @llvm.amdgcn.readlane(i32 %sgpr, i32 7)
+ %readfirstlane = call i32 @llvm.amdgcn.readlane.i32(i32 %sgpr, i32 7)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
+define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %sgpr = call i64 asm "s_mov_b64 $0, 0", "=s"()
+ %readfirstlane = call i64 @llvm.amdgcn.readlane.i64(i64 %sgpr, i32 7)
+ store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT: s_endpgm
+ %sgpr = call double asm "s_mov_b64 $0, 0", "=s"()
+ %readfirstlane = call double @llvm.amdgcn.readlane.f64(double %sgpr, i32 7)
+ store double %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_readlane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_half:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_half:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call half @llvm.amdgcn.readlane.f16(half %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(half %x)
+ ret void
+}
+
+define void @test_readlane_float(ptr addrspace(1) %out, float %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_float:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_float:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call float @llvm.amdgcn.readlane.f32(float %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(float %x)
+ ret void
+}
+
+define void @test_readlane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_bfloat:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_bfloat:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(bfloat %x)
+ ret void
+}
+
+define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0xffff
+; CHECK-SDAG-NEXT: s_nop 2
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use v0
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(i16 %x)
+ ret void
+}
+
+define void @test_readlane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v2f16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v2f16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x half> @llvm.amdgcn.readlane.v2f16(<2 x half> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<2 x half> %x)
+ ret void
+}
+
+define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v2f32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v2f32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s5
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s5
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:5]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x float> @llvm.amdgcn.readlane.v2f32(<2 x float> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<2 x float> %x)
+ ret void
+}
+
+define void @test_readlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v7i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:10]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v7i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v9
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s10
+; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s10
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:10]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <7 x i32> @llvm.amdgcn.readlane.v7i32(<7 x i32> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<7 x i32> %x)
+ ret void
+}
+
+define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_p0:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s5
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s5
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:5]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(ptr %x)
+ ret void
+}
+
+define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v3p0:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v8
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s9
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s9
+; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s9
+; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s9
+; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s9
+; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s9
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:9]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
+ ret void
+}
+
+define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v8i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v8i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v6
+; CHECK-GISEL-NEXT: s_nop 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s7
+; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s7
+; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s7
+; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s7
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
new file mode 100644
index 0000000000000..1b4ee84c75250
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
+
+define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p3:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(3) @llvm.amdgcn.readlane.p3(ptr addrspace(3) %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x)
+ ret void
+}
+
+define void @test_readlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p3:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readlane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x)
+ ret void
+}
+
+define void @test_readlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p5:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(5) @llvm.amdgcn.readlane.p5(ptr addrspace(5) %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %x)
+ ret void
+}
+
+define void @test_readlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p5:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readlane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(5)> %x)
+ ret void
+}
+
+define void @test_readlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p6:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr addrspace(6) @llvm.amdgcn.readlane.p6(ptr addrspace(6) %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(ptr addrspace(6) %x)
+ ret void
+}
+
+define void @test_readlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p6:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readlane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(6)> %x)
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 37951669dbe75..d0a865f565eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -1,85 +1,3044 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CIGFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CIGFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX802-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64) #0
+declare double @llvm.amdgcn.writelane.f64(double, i32, double) #0
-; CHECK-LABEL: {{^}}test_writelane_sreg:
-; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_sreg(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
+; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
+; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
%oldval = load i32, ptr addrspace(1) %out
- %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_imm_sreg:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_imm_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load i64, ptr addrspace(1) %out
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load double, ptr addrspace(1) %out
+ %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, 32, s2
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s2
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s0
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, 32, s2
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s2
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s0
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
%oldval = load i32, ptr addrspace(1) %out
- %writelane = call i32 @llvm.amdgcn.writelane(i32 32, i32 %src1, i32 %oldval)
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 32, i32 %src1, i32 %oldval)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_vreg_lane:
-; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
-; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
-define amdgpu_kernel void @test_writelane_vreg_lane(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, 0, s4
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, 32, s4
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s4
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, 32, s4
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, 0, s4
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s4
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s4
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load i64, ptr addrspace(1) %out
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 32, i32 %src1, i64 %oldval)
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8
+; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s4
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT: s_mov_b32 s2, 0x40400000
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT: s_mov_b32 s0, 0x40400000
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s4
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8
+; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, 0, s4
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-GISEL-NEXT: s_mov_b32 s2, 0x40400000
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 0, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT: s_mov_b32 s0, 0x40400000
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 0, s4
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load double, ptr addrspace(1) %out
+ %writelane = call double @llvm.amdgcn.writelane.f64(double 32.0, i32 %src1, double %oldval)
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dword v0, v[0:1]
+; GFX802-SDAG-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: s_nop 2
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, 12, s2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:4
+; GFX1010-SDAG-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-SDAG-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1010-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:4
+; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1100-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dword v0, v[0:1]
+; GFX802-GISEL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: s_nop 2
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, 12, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dword v0, v0, s[2:3] offset:4
+; GFX1010-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-GISEL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1010-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] offset:4
+; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1100-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
%args = load <2 x i32>, ptr addrspace(1) %gep.in
%oldval = load i32, ptr addrspace(1) %out
%lane = extractelement <2 x i32> %args, i32 1
- %writelane = call i32 @llvm.amdgcn.writelane(i32 12, i32 %lane, i32 %oldval)
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 12, i32 %lane, i32 %oldval)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_m0_sreg:
-; CHECK: s_mov_b32 m0, -1
-; CIGFX9: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
-; CIGFX9: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0
-; GFX10: v_writelane_b32 v{{[0-9]+}}, m0, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1]
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_nop 2
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, 0, s2
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, 12, s2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:8
+; GFX1010-SDAG-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s3
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 12, s3
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s3
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 12, s3
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX802-GISEL-NEXT: s_nop 3
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, 12, s2
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, 0, s2
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8
+; GFX1010-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1010-GISEL-NEXT: v_writelane_b32 v2, 0, s2
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 12, s2
+; GFX1100-GISEL-NEXT: v_writelane_b32 v2, 0, s2
+; GFX1100-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
+ %args = load <2 x i64>, ptr addrspace(1) %gep.in
+ %oldval = load i64, ptr addrspace(1) %out
+ %lane = extractelement <2 x i64> %args, i32 1
+ %lane32 = trunc i64 %lane to i32
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 12, i32 %lane32, i64 %oldval)
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GFX802-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1]
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v2
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s4, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s2
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:8
+; GFX1010-SDAG-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT: s_mov_b32 s2, 0x40280000
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s3
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s3
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1100-SDAG-NEXT: s_mov_b32 s2, 0x40280000
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s2, s3
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s3
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX802-GISEL-NEXT: s_nop 2
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, 0, s2
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8
+; GFX1010-GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX1010-GISEL-NEXT: s_mov_b32 s3, 0x40280000
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s2
+; GFX1010-GISEL-NEXT: v_writelane_b32 v2, s3, s2
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX1100-GISEL-NEXT: s_mov_b32 s3, 0x40280000
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s2
+; GFX1100-GISEL-NEXT: v_writelane_b32 v2, s3, s2
+; GFX1100-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid
+ %args = load <2 x double>, ptr addrspace(1) %gep.in
+ %oldval = load double, ptr addrspace(1) %out
+ %lane = extractelement <2 x double> %args, i32 1
+ %lane_cast = bitcast double %lane to i64
+ %lane32 = trunc i64 %lane_cast to i32
+ %writelane = call double @llvm.amdgcn.writelane.f64(double 12.0, i32 %lane32, double %oldval)
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-SDAG-NEXT: ;;#ASMSTART
+; GFX802-SDAG-NEXT: s_mov_b32 m0, -1
+; GFX802-SDAG-NEXT: ;;#ASMEND
+; GFX802-SDAG-NEXT: s_mov_b32 s4, m0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: ;;#ASMSTART
+; GFX1010-SDAG-NEXT: s_mov_b32 m0, -1
+; GFX1010-SDAG-NEXT: ;;#ASMEND
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, m0, s2
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: ;;#ASMSTART
+; GFX1100-SDAG-NEXT: s_mov_b32 m0, -1
+; GFX1100-SDAG-NEXT: ;;#ASMEND
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, m0, s0
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-GISEL-NEXT: ;;#ASMSTART
+; GFX802-GISEL-NEXT: s_mov_b32 m0, -1
+; GFX802-GISEL-NEXT: ;;#ASMEND
+; GFX802-GISEL-NEXT: s_mov_b32 s4, m0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s2
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-GISEL-NEXT: ;;#ASMSTART
+; GFX1010-GISEL-NEXT: s_mov_b32 m0, -1
+; GFX1010-GISEL-NEXT: ;;#ASMEND
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, m0, s2
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-GISEL-NEXT: ;;#ASMSTART
+; GFX1100-GISEL-NEXT: s_mov_b32 m0, -1
+; GFX1100-GISEL-NEXT: ;;#ASMEND
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, m0, s0
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
%oldval = load i32, ptr addrspace(1) %out
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
- %writelane = call i32 @llvm.amdgcn.writelane(i32 %m0, i32 %src1, i32 %oldval)
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %m0, i32 %src1, i32 %oldval)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_imm:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
-define amdgpu_kernel void @test_writelane_imm(ptr addrspace(1) %out, i32 %src0) #1 {
+define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %src0) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, 32
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, 32
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, 32
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, 32
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
%oldval = load i32, ptr addrspace(1) %out
- %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 32, i32 %oldval) #0
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 32, i32 %oldval) #0
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_sreg_oldval:
-; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
-; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
- %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %src0) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load i64, ptr addrspace(1) %out
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 32, i64 %oldval) #0
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double %src0) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, 32
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %oldval = load double, ptr addrspace(1) %out
+ %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 32, double %oldval) #0
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s6
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x8
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x8
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}test_writelane_imm_oldval:
-; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
-; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_imm_oldval(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
- %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42)
+define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x2
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x2
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x2
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x2
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18
+; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x2
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x2
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18
+; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x2
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x2
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 42)
store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
+; Kernel test: llvm.amdgcn.writelane.i64 where the old-value operand is the
+; constant 42. The checks below show the 64-bit operation emitted as two
+; v_writelane_b32 instructions, one per 32-bit half (low half preloaded with
+; 42, high half with 0), followed by a single 64-bit store to %out.
+define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s0
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s0
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s0
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 42)
+ store i64 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Kernel test: llvm.amdgcn.writelane.f64 where the old-value operand is the
+; constant 42.0. The checks below show the constant materialized as two
+; 32-bit halves (0 and 0x40450000) and the operation emitted as two
+; v_writelane_b32 instructions, followed by a single 64-bit store to %out.
+define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+; GFX802-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT: s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT: s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s0
+; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s0
+; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT: s_nop 0
+; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT: s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT: s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT: s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s0
+; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s0
+; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT: s_nop 0
+; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT: s_endpgm
+ %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double 42.0)
+ store double %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel function test: llvm.amdgcn.writelane.f16 with the old value loaded
+; from %out. The checks below show %src and %src1 each passed through
+; v_readfirstlane_b32 and the operation emitted as a single v_writelane_b32,
+; with the result written back as a 16-bit store.
+define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_half:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_half:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_half:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_half:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_half:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_half:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load half, ptr addrspace(1) %out
+ %writelane = call half @llvm.amdgcn.writelane.f16(half %src, i32 %src1, half %oldval)
+ store half %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel function test: llvm.amdgcn.writelane.f32 with the old value loaded
+; from %out. The checks below show %src and %src1 each passed through
+; v_readfirstlane_b32 and the operation emitted as a single v_writelane_b32,
+; with the result written back as a 32-bit store.
+define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_float:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_float:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_float:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_float:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_float:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_float:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load float, ptr addrspace(1) %out
+ %writelane = call float @llvm.amdgcn.writelane.f32(float %src, i32 %src1, float %oldval)
+ store float %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel function test: llvm.amdgcn.writelane.bf16 with the old value
+; loaded from %out. The checks below show %src and %src1 each passed through
+; v_readfirstlane_b32 and the operation emitted as a single v_writelane_b32,
+; with the result written back as a 16-bit store.
+define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_bfloat:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_bfloat:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_bfloat:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_bfloat:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_bfloat:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_bfloat:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load bfloat, ptr addrspace(1) %out
+ %writelane = call bfloat @llvm.amdgcn.writelane.bf16(bfloat %src, i32 %src1, bfloat %oldval)
+ store bfloat %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel function test: llvm.amdgcn.writelane.i16 with the old value loaded
+; from %out. The checks below show %src and %src1 each passed through
+; v_readfirstlane_b32 and the operation emitted as a single v_writelane_b32,
+; with the result written back as a 16-bit store.
+define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_i16:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_i16:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_i16:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_i16:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_i16:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_ushort v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_short v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_i16:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_u16 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load i16, ptr addrspace(1) %out
+ %writelane = call i16 @llvm.amdgcn.writelane.i16(i16 %src, i32 %src1, i16 %oldval)
+ store i16 %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel function test: llvm.amdgcn.writelane.v2f16 with the old value
+; loaded from %out. The checks below show the <2 x half> value handled as one
+; 32-bit register: a dword load, a single v_writelane_b32, and a dword store.
+define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v2f16:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v2f16:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v2f16:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v2f16:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v2f16:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v2f16:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <2 x half>, ptr addrspace(1) %out
+ %writelane = call <2 x half> @llvm.amdgcn.writelane.v2f16(<2 x half> %src, i32 %src1, <2 x half> %oldval)
+ store <2 x half> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Non-kernel function test: llvm.amdgcn.writelane.v2f32 with the old value
+; loaded from %out. The checks below show the <2 x float> operation emitted as
+; two v_writelane_b32 instructions, one per element, sharing one lane select.
+; NOTE(review): the function is named test_readlane_v2f32 but it exercises
+; writelane, not readlane; this looks like a copy/paste slip and the name
+; (together with the -LABEL lines below) presumably should read
+; test_writelane_v2f32 -- confirm and regenerate the checks if renamed.
+define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_readlane_v2f32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_readlane_v2f32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v5, s6, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_readlane_v2f32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b64 v[5:6], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1
+; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_readlane_v2f32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v5, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v6, s6, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_readlane_v2f32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v5, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v6, s6, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_readlane_v2f32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b64 v[5:6], v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v5, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v6, s2, s1
+; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[5:6], off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <2 x float>, ptr addrspace(1) %out
+ %writelane = call <2 x float> @llvm.amdgcn.writelane.v2f32(<2 x float> %src, i32 %src1, <2 x float> %oldval)
+ store <2 x float> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v7i32:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_add_u32_e32 v17, vcc, 16, v0
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[10:13], v[0:1]
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dwordx3 v[14:16], v[17:18]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v9
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v7
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v6
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v13, s7, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v12, s8, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v11, s9, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v10, s10, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v15, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v14, s6, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
+; GFX802-SDAG-NEXT: flat_store_dwordx3 v[17:18], v[14:16]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v7i32:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: global_load_dwordx3 v[14:16], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v9
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v7
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v6
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s4, s5
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s8, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s9, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s10, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s11, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s7, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
+; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[14:16], off offset:16
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v7i32:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_load_b96 v[14:16], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v7
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v6
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s0, s1
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s4, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s5, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s6, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s7, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s2, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s3, s1
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off
+; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[14:16], off offset:16
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v7i32:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_add_u32_e32 v18, vcc, 16, v0
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[10:13], v[0:1]
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[14:17], v[18:19]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v9
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX802-GISEL-NEXT: v_writelane_b32 v10, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v11, s6, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v12, s7, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v13, s8, m0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v14, s9, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v15, s10, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v16, s11, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
+; GFX802-GISEL-NEXT: flat_store_dwordx3 v[18:19], v[14:16]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v7i32:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:16
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v9
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s6, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s7, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s8, s5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s9, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v15, s10, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v16, s11, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
+; GFX1010-GISEL-NEXT: global_store_dwordx3 v[0:1], v[14:16], off offset:16
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v7i32:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
+; GFX1100-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:16
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s2, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s3, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s4, s1
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s5, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v15, s6, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v16, s7, s1
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off
+; GFX1100-GISEL-NEXT: global_store_b96 v[0:1], v[14:16], off offset:16
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <7 x i32>, ptr addrspace(1) %out
+ %writelane = call <7 x i32> @llvm.amdgcn.writelane.v7i32(<7 x i32> %src, i32 %src1, <7 x i32> %oldval)
+ store <7 x i32> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { ; writelane on a single 64-bit ptr; the checks below expect two v_writelane_b32 (one per 32-bit half) on every target
+; GFX802-SDAG-LABEL: test_writelane_p0:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p0:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v5, s6, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p0:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b64 v[5:6], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1
+; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_p0:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v5, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v6, s6, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_p0:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v5, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v6, s6, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_p0:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b64 v[5:6], v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v5, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v6, s2, s1
+; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[5:6], off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load ptr, ptr addrspace(1) %out ; value currently stored at %out, passed to writelane as its third (old value) operand
+ %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval) ; %src and %src1 are VGPR arguments, so the checks expect v_readfirstlane_b32 on each before the writelane
+ store ptr %writelane, ptr addrspace(1) %out, align 4 ; result overwrites the location %oldval was loaded from
+ ret void
+}
+
+define void @test_writelane_v3p0(ptr addrspace(1) %out, <4 x ptr> %src, i32 %src1) { ; NOTE: despite "v3p0" in the test name the operand is <4 x ptr>; the checks below expect eight v_writelane_b32 per target
+; GFX802-SDAG-LABEL: test_writelane_v3p0:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_add_u32_e32 v19, vcc, 16, v0
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[0:1]
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[19:20]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v10
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v2
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v14, s8, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v13, s9, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v12, s10, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v11, s11, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v18, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v17, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v15, s7, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p0:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[15:18], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v10
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v18, s9, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v17, s10, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s11, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s12, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[11:14], off offset:16
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p0:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_load_b128 v[11:14], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT: global_load_b128 v[15:18], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v18, s5, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v17, s6, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s7, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s8, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[15:18], off
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[11:14], off offset:16
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v3p0:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: v_add_u32_e32 v19, vcc, 16, v0
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[11:14], v[0:1]
+; GFX802-GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[15:18], v[19:20]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v10
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v9
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX802-GISEL-NEXT: v_writelane_b32 v11, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v12, s6, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v13, s7, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v14, s8, m0
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v15, s9, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v16, s10, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v17, s11, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v18, s12, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v3p0:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: s_clause 0x1
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:16
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v10
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s12, v9
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s6, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v15, s9, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v16, s10, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v17, s11, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v18, s12, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off offset:16
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v3p0:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: global_load_b128 v[11:14], v[0:1], off
+; GFX1100-GISEL-NEXT: global_load_b128 v[15:18], v[0:1], off offset:16
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s8, v9
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s2, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v15, s5, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v16, s6, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v17, s7, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v18, s8, s1
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[11:14], off
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[15:18], off offset:16
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <4 x ptr>, ptr addrspace(1) %out ; value currently stored at %out, passed to writelane as its third (old value) operand
+ %writelane = call <4 x ptr> @llvm.amdgcn.writelane.v4p0(<4 x ptr> %src, i32 %src1, <4 x ptr> %oldval) ; intrinsic suffix spells the overload type actually used (<4 x ptr>), so the name and the signature agree
+ store <4 x ptr> %writelane, ptr addrspace(1) %out, align 4 ; result overwrites the location %oldval was loaded from
+ ret void
+}
+
+define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) { ; writelane on <8 x i16>; the checks below load/store it as four dwords and expect four v_writelane_b32 per target
+; GFX802-SDAG-LABEL: test_writelane_v8i16:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v6
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v10, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v9, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v8, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s7, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v8i16:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v9, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s7, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s8, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v8i16:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b128 v[7:10], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v9, s2, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s3, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s4, s1
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[7:10], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v8i16:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v7, s4, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v8, s6, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v9, s7, m0
+; GFX802-GISEL-NEXT: v_writelane_b32 v10, s8, m0
+; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v8i16:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v6
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v7, s4, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v8, s6, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v9, s7, s5
+; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s8, s5
+; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v8i16:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_b128 v[7:10], v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v6
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v7, s0, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v8, s2, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v9, s3, s1
+; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s4, s1
+; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[7:10], off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <8 x i16>, ptr addrspace(1) %out ; value currently stored at %out, passed to writelane as its third (old value) operand
+ %writelane = call <8 x i16> @llvm.amdgcn.writelane.v8i16(<8 x i16> %src, i32 %src1, <8 x i16> %oldval) ; %src and %src1 are VGPR arguments, so the checks expect v_readfirstlane_b32 on each before the writelanes
+ store <8 x i16> %writelane, ptr addrspace(1) %out, align 4 ; result overwrites the location %oldval was loaded from
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
new file mode 100644
index 0000000000000..afc394627d356
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
+
+define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) { ; writelane on a single addrspace(3) pointer; the checks below handle it as one dword with one v_writelane_b32
+; GFX802-SDAG-LABEL: test_writelane_p3:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p3:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p3:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load ptr addrspace(3), ptr addrspace(1) %out ; value currently stored at %out, passed to writelane as its third (old value) operand
+ %writelane = call ptr addrspace(3) @llvm.amdgcn.writelane.p3(ptr addrspace(3) %src, i32 %src1, ptr addrspace(3) %oldval) ; %src and %src1 are VGPR arguments, so the checks expect v_readfirstlane_b32 on each before the writelane
+ store ptr addrspace(3) %writelane, ptr addrspace(1) %out, align 4 ; result overwrites the location %oldval was loaded from
+ ret void
+}
+
+define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) { ; writelane on a 3-element vector of addrspace(3) pointers; the checks below expect three v_writelane_b32 per target
+; GFX802-SDAG-LABEL: test_writelane_v3p3:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p3:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s7, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p3:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b96 v[6:8], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1
+; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <3 x ptr addrspace(3)>, ptr addrspace(1) %out ; value currently stored at %out, passed to writelane as its third (old value) operand
+ %writelane = call <3 x ptr addrspace(3)> @llvm.amdgcn.writelane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1, <3 x ptr addrspace(3)> %oldval) ; %src and %src1 are VGPR arguments, so the checks expect v_readfirstlane_b32 on each before the writelanes
+ store <3 x ptr addrspace(3)> %writelane, ptr addrspace(1) %out, align 4 ; result overwrites the location %oldval was loaded from
+ ret void
+}
+
+define void @test_writelane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 %src1) { ; writelane on a single addrspace(5) pointer; the checks below handle it as one dword with one v_writelane_b32
+; GFX802-SDAG-LABEL: test_writelane_p5:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p5:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p5:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load ptr addrspace(5), ptr addrspace(1) %out ; value currently stored at %out, passed to writelane as its third (old value) operand
+ %writelane = call ptr addrspace(5) @llvm.amdgcn.writelane.p5(ptr addrspace(5) %src, i32 %src1, ptr addrspace(5) %oldval) ; %src and %src1 are VGPR arguments, so the checks expect v_readfirstlane_b32 on each before the writelane
+ store ptr addrspace(5) %writelane, ptr addrspace(1) %out, align 4 ; result overwrites the location %oldval was loaded from
+ ret void
+}
+
+define void @test_writelane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src, i32 %src1) { ; writelane on a 3-element vector of addrspace(5) pointers; the checks below expect three v_writelane_b32 per target
+; GFX802-SDAG-LABEL: test_writelane_v3p5:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p5:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s7, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p5:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b96 v[6:8], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1
+; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <3 x ptr addrspace(5)>, ptr addrspace(1) %out ; value currently stored at %out, passed to writelane as its third (old value) operand
+ %writelane = call <3 x ptr addrspace(5)> @llvm.amdgcn.writelane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1, <3 x ptr addrspace(5)> %oldval) ; %src and %src1 are VGPR arguments, so the checks expect v_readfirstlane_b32 on each before the writelanes
+ store <3 x ptr addrspace(5)> %writelane, ptr addrspace(1) %out, align 4 ; result overwrites the location %oldval was loaded from
+ ret void
+}
+
+define void @test_writelane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p6:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p6:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p6:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load ptr addrspace(6), ptr addrspace(1) %out
+ %writelane = call ptr addrspace(6) @llvm.amdgcn.writelane.p6(ptr addrspace(6) %src, i32 %src1, ptr addrspace(6) %oldval)
+ store ptr addrspace(6) %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p6:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p6:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s7, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p6:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b96 v[6:8], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1
+; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <3 x ptr addrspace(6)>, ptr addrspace(1) %out
+ %writelane = call <3 x ptr addrspace(6)> @llvm.amdgcn.writelane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1, <3 x ptr addrspace(6)> %oldval)
+ store <3 x ptr addrspace(6)> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 94c32e3cbe99f..483ea8ad57d1b 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2714,7 +2714,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: store volatile i32 [[VAR]], ptr undef, align 4
; CHECK-NEXT: store volatile i32 0, ptr undef, align 4
; CHECK-NEXT: store volatile i32 123, ptr undef, align 4
@@ -2737,7 +2737,7 @@ define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
define i32 @readfirstlane_idempotent(i32 %arg) {
; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2748,7 +2748,7 @@ define i32 @readfirstlane_idempotent(i32 %arg) {
define i32 @readfirstlane_readlane(i32 %arg) {
; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2759,10 +2759,10 @@ define i32 @readfirstlane_readlane(i32 %arg) {
define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
@@ -2777,10 +2777,10 @@ bb1:
define i32 @readfirstlane_readlane_different_block(i32 %arg) {
; CHECK-LABEL: @readfirstlane_readlane_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 0)
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
@@ -2800,7 +2800,7 @@ declare i32 @llvm.amdgcn.readlane(i32, i32)
define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
+; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 7)
; CHECK-NEXT: store volatile i32 [[VAR]], ptr undef, align 4
; CHECK-NEXT: store volatile i32 0, ptr undef, align 4
; CHECK-NEXT: store volatile i32 123, ptr undef, align 4
@@ -2823,7 +2823,7 @@ define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2833,8 +2833,8 @@ define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1:%.*]])
; CHECK-NEXT: ret i32 [[READ1]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
@@ -2844,7 +2844,7 @@ define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1
define i32 @readlane_readfirstlane(i32 %arg) {
; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2855,10 +2855,10 @@ define i32 @readlane_readfirstlane(i32 %arg) {
define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) {
; CHECK-LABEL: @readlane_idempotent_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE]])
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
@@ -2874,10 +2874,10 @@ bb1:
define i32 @readlane_readfirstlane_different_block(i32 %arg) {
; CHECK-LABEL: @readlane_readfirstlane_different_block(
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0)
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 0)
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
>From 881e11673545164b0172c42ffd0fbba47ebe38dd Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Sat, 18 May 2024 05:28:50 -0400
Subject: [PATCH 02/11] [AMDGPU] Extend permlane16, permlanex16 and permlane64
intrinsic lowering for generic types
---
clang/lib/CodeGen/CGBuiltin.cpp | 19 +
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 10 +-
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 6 +-
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 +
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 25 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 74 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 +-
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 10 +-
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 10 +-
.../UniformityAnalysis/AMDGPU/intrinsics.ll | 12 +-
.../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 2694 +++++++++++++++--
.../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 28 +-
llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll | 20 +-
14 files changed, 2671 insertions(+), 292 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a275ada586d13..9ce2f5b6c103b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18479,6 +18479,25 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
return Builder.CreateCall(F, Args);
}
+ case AMDGPU::BI__builtin_amdgcn_permlane16:
+ case AMDGPU::BI__builtin_amdgcn_permlanex16: {
+ Intrinsic::ID IID;
+ IID = BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
+ ? Intrinsic::amdgcn_permlane16
+ : Intrinsic::amdgcn_permlanex16;
+
+ llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+ llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+ llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
+ llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
+ llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
+ llvm::Value *Src5 = EmitScalarExpr(E->getArg(5));
+
+ llvm::Function *F = CGM.getIntrinsic(IID, Src1->getType());
+ return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5});
+ }
+ case AMDGPU::BI__builtin_amdgcn_permlane64:
+ return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_permlane64);
case AMDGPU::BI__builtin_amdgcn_readlane:
return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
case AMDGPU::BI__builtin_amdgcn_readfirstlane:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 457566944069e..63f7f48e82e4a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2488,15 +2488,15 @@ def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
- Intrinsic<[llvm_i32_ty],
- [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
+ Intrinsic<[llvm_any_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
- Intrinsic<[llvm_i32_ty],
- [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
+ Intrinsic<[llvm_any_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
@@ -2540,7 +2540,7 @@ def int_amdgcn_image_bvh_intersect_ray :
// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
ClangBuiltin<"__builtin_amdgcn_permlane64">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
def int_amdgcn_ds_add_gs_reg_rtn :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 9ac0a52f63fd8..5fec5dafe2acd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -413,7 +413,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *Permlanex16Call = B.CreateIntrinsic(
- Intrinsic::amdgcn_permlanex16, {},
+ V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlanex16Call, AtomicTy));
@@ -425,7 +425,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Reduce across the upper and lower 32 lanes.
V = B.CreateBitCast(V, IntNTy);
Value *Permlane64Call =
- B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
+ B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlane64Call, AtomicTy));
}
@@ -481,7 +481,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *PermX = B.CreateIntrinsic(
- Intrinsic::amdgcn_permlanex16, {},
+ V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
Value *UpdateDPPCall =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9cd7496ba9f96..12a3ce486a75d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5499,6 +5499,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(READLANE)
NODE_NAME_CASE(READFIRSTLANE)
NODE_NAME_CASE(WRITELANE)
+ NODE_NAME_CASE(PERMLANE16)
+ NODE_NAME_CASE(PERMLANEX16)
+ NODE_NAME_CASE(PERMLANE64)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(LOAD_D16_HI)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 02c3dcf39e554..650ad948184d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -561,6 +561,9 @@ enum NodeType : unsigned {
READLANE,
READFIRSTLANE,
WRITELANE,
+ PERMLANE16,
+ PERMLANEX16,
+ PERMLANE64,
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index e4f329b200c86..a82f48950f493 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -354,9 +354,21 @@ def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>
]>;
+def AMDGPUDPermlane16Op : SDTypeProfile<1, 6, [
+ SDTCisSameAs<0, 1>, // old
+ SDTCisSameAs<0, 2>, // src0
+ SDTCisInt<3>, // src1
+ SDTCisInt<4>, // src2
+ SDTCisInt<5>, // i1 fi
+ SDTCisInt<6> // i1 bound_ctrl
+]>;
+
def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>;
def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>;
def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp>;
+def AMDGPUpermlane16_impl : SDNode<"AMDGPUISD::PERMLANE16", AMDGPUDPermlane16Op>;
+def AMDGPUpermlanex16_impl : SDNode<"AMDGPUISD::PERMLANEX16", AMDGPUDPermlane16Op>;
+def AMDGPUpermlane64_impl : SDNode<"AMDGPUISD::PERMLANE64", AMDGPUReadfirstlaneOp>;
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
@@ -535,3 +547,16 @@ def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
(AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
+def AMDGPUpermlane16 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
+ [(int_amdgcn_permlane16 node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
+ (AMDGPUpermlane16_impl node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5)]>;
+
+def AMDGPUpermlanex16 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
+ [(int_amdgcn_permlanex16 node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
+ (AMDGPUpermlanex16_impl node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5)]>;
+
+def AMDGPUpermlane64 : PatFrags<(ops node:$src),
+ [(int_amdgcn_permlane64 node:$src),
+ (AMDGPUpermlane64_impl node:$src)]>;
+
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 6ffc8a20f76fa..b28c3521d6336 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5397,25 +5397,42 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
Register DstReg = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(2).getReg();
+ bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
+ IID == Intrinsic::amdgcn_permlanex16;
+
auto createLaneOp = [&](Register Src0, Register Src1,
Register Src2) -> Register {
auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0);
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
return LaneOp.addUse(Src1).getReg(0);
case Intrinsic::amdgcn_writelane:
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
+ Register Src3 = MI.getOperand(5).getReg();
+ int64_t Src4 = MI.getOperand(6).getImm(); // immarg fi: keep as an immediate, not a Register
+ int64_t Src5 = MI.getOperand(7).getImm(); // immarg bound_ctrl
+ return LaneOp.addUse(Src1)
+ .addUse(Src2)
+ .addUse(Src3)
+ .addImm(Src4)
+ .addImm(Src5)
+ .getReg(0);
+ }
default:
llvm_unreachable("unhandled lane op");
}
};
Register Src1, Src2;
- if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
+ IsPermLane16) {
Src1 = MI.getOperand(3).getReg();
- if (IID == Intrinsic::amdgcn_writelane) {
+ if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src2 = MI.getOperand(4).getReg();
}
}
@@ -5433,7 +5450,16 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
? Src0
: B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
Src0 = B.buildAnyExt(S32, Src0Cast).getReg(0);
- if (Src2.isValid()) {
+
+ if (IsPermLane16) { // operand 3 shares the overloaded value type, so widen it like Src0
+ Register Src1Cast =
+ MRI.getType(Src1).isScalar()
+ ? Src1
+ : B.buildBitcast(LLT::scalar(Size), Src1).getReg(0); // cast Src1 itself, not the i32 Src2
+ Src1 = B.buildAnyExt(LLT::scalar(32), Src1Cast).getReg(0);
+ }
+
+ if (IID == Intrinsic::amdgcn_writelane) {
Register Src2Cast =
MRI.getType(Src2).isScalar()
? Src2
@@ -5485,46 +5511,45 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
}
break;
}
- case Intrinsic::amdgcn_readfirstlane: {
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_permlane64: {
for (unsigned i = 0; i < NumParts; ++i) {
Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
: Src0Parts.getReg(i);
PartialRes.push_back(
- (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {S32})
- .addUse(Src0)
- .getReg(0)));
+ (B.buildIntrinsic(IID, {S32}).addUse(Src0).getReg(0)));
}
break;
}
- case Intrinsic::amdgcn_writelane: {
+ case Intrinsic::amdgcn_writelane:
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
Register Src1 = MI.getOperand(3).getReg();
Register Src2 = MI.getOperand(4).getReg();
- MachineInstrBuilder Src2Parts;
+
+ Register SrcX = IsPermLane16 ? Src1 : Src2;
+ MachineInstrBuilder SrcXParts;
if (Ty.isPointer()) {
- auto PtrToInt = B.buildPtrToInt(S64, Src2);
- Src2Parts = B.buildUnmerge(S32, PtrToInt);
+ auto PtrToInt = B.buildPtrToInt(S64, SrcX);
+ SrcXParts = B.buildUnmerge(S32, PtrToInt);
} else if (Ty.isPointerVector()) {
LLT IntVecTy = Ty.changeElementType(
LLT::scalar(Ty.getElementType().getSizeInBits()));
- auto PtrToInt = B.buildPtrToInt(IntVecTy, Src2);
- Src2Parts = B.buildUnmerge(S32, PtrToInt);
+ auto PtrToInt = B.buildPtrToInt(IntVecTy, SrcX);
+ SrcXParts = B.buildUnmerge(S32, PtrToInt);
} else
- Src2Parts =
- IsS16Vec ? B.buildUnmerge(V2S16, Src2) : B.buildUnmerge(S32, Src2);
+ SrcXParts =
+ IsS16Vec ? B.buildUnmerge(V2S16, SrcX) : B.buildUnmerge(S32, SrcX);
for (unsigned i = 0; i < NumParts; ++i) {
Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
: Src0Parts.getReg(i);
- Src2 = IsS16Vec ? B.buildBitcast(S32, Src2Parts.getReg(i)).getReg(0)
- : Src2Parts.getReg(i);
- PartialRes.push_back(
- (B.buildIntrinsic(Intrinsic::amdgcn_writelane, {S32})
- .addUse(Src0)
- .addUse(Src1)
- .addUse(Src2))
- .getReg(0));
+ SrcX = IsS16Vec ? B.buildBitcast(S32, SrcXParts.getReg(i)).getReg(0)
+ : SrcXParts.getReg(i);
+ PartialRes.push_back(IsPermLane16 ? createLaneOp(Src0, SrcX, Src2)
+ : createLaneOp(Src0, Src1, SrcX));
}
break;
@@ -7519,6 +7544,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_permlane64:
return legalizeLaneOp(Helper, MI, IntrID);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6c23bdf09974b..5d34ed089f65d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6091,22 +6091,39 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
EVT VT = N->getValueType(0);
unsigned ValSize = VT.getSizeInBits();
unsigned IntrinsicID = N->getConstantOperandVal(0);
+ bool IsPermLane16 = IntrinsicID == Intrinsic::amdgcn_permlane16 ||
+ IntrinsicID == Intrinsic::amdgcn_permlanex16;
+ bool IsPermLane64 = IntrinsicID == Intrinsic::amdgcn_permlane64;
SDValue Src0 = N->getOperand(1);
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);
- auto createLaneOp = [&DAG, &SL](SDValue Src0, SDValue Src1, SDValue Src2,
- MVT VT) -> SDValue {
- return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
- : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
- : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
+ auto createLaneOp = [&](SDValue Src0, SDValue Src1, SDValue Src2,
+ MVT ValueT) -> SDValue {
+ if (IsPermLane16 || IsPermLane64) {
+ if (IsPermLane16) {
+ SDValue Src3 = N->getOperand(4);
+ SDValue Src4 = N->getOperand(5);
+ SDValue Src5 = N->getOperand(6);
+ return DAG.getNode(IntrinsicID == Intrinsic::amdgcn_permlane16
+ ? AMDGPUISD::PERMLANE16
+ : AMDGPUISD::PERMLANEX16,
+ SL, ValueT, {Src0, Src1, Src2, Src3, Src4, Src5});
+ }
+ return DAG.getNode(AMDGPUISD::PERMLANE64, SL, ValueT, {Src0});
+ }
+
+ return (
+ Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, ValueT, {Src0, Src1, Src2})
+ : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, ValueT, {Src0, Src1})
+ : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, ValueT, {Src0}));
};
SDValue Src1, Src2;
if (IntrinsicID == Intrinsic::amdgcn_readlane ||
- IntrinsicID == Intrinsic::amdgcn_writelane) {
+ IntrinsicID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src1 = N->getOperand(2);
- if (IntrinsicID == Intrinsic::amdgcn_writelane)
+ if (IntrinsicID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
}
@@ -6118,10 +6135,17 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
if (ValSize < 32) {
SDValue InitBitCast = DAG.getBitcast(IntVT, Src0);
Src0 = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32);
- if (Src2.getNode()) {
+
+ if (IsPermLane16) {
+ SDValue Src1Cast = DAG.getBitcast(IntVT, Src1);
+ Src1 = DAG.getAnyExtOrTrunc(Src1Cast, SL, MVT::i32);
+ }
+
+ if (IntrinsicID == Intrinsic::amdgcn_writelane) {
SDValue Src2Cast = DAG.getBitcast(IntVT, Src2);
Src2 = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32);
}
+
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
return DAG.getBitcast(VT, Trunc);
@@ -6131,7 +6155,11 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
Src0 = DAG.getBitcast(VecVT, Src0);
- if (Src2.getNode())
+ if (IsPermLane16) {
+ Src1 = DAG.getBitcast(VecVT, Src1);
+ }
+
+ if (IntrinsicID == Intrinsic::amdgcn_writelane)
Src2 = DAG.getBitcast(VecVT, Src2);
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
@@ -8612,6 +8640,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_writelane:
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_permlane64:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 9c5d6a7bf6d0b..33d3da3830f57 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -717,15 +717,19 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
- getVOP1Pat<int_amdgcn_permlane64,
- VOP_MOVRELS>.ret,
- /*VOP1Only=*/ 1>;
+ [], /*VOP1Only=*/ 1>;
defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
} // End SubtargetPredicate = isGFX11Plus
+foreach vt = Reg32Types.types in {
+ def : GCNPat<(AMDGPUpermlane64 (vt VRegSrc_32:$src0)),
+ (vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
+ >;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 616bc7684753e..eee2085eab341 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -838,8 +838,8 @@ def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">,
GISDNodeXFormEquiv<opsel_i1timm>;
class PermlanePat<SDPatternOperator permlane,
- Instruction inst> : GCNPat<
- (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2,
+ Instruction inst, ValueType vt> : GCNPat<
+ (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2,
timm:$fi, timm:$bc),
(inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc),
SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
@@ -864,8 +864,10 @@ let SubtargetPredicate = isGFX10Plus in {
defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
- def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64>;
- def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64>;
+ foreach vt = Reg32Types.types in {
+ def : PermlanePat<AMDGPUpermlane16, V_PERMLANE16_B32_e64, vt>;
+ def : PermlanePat<AMDGPUpermlanex16, V_PERMLANEX16_B32_e64, vt>;
+ }
defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 74d2f53d7b317..680c998a4b39f 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -7,16 +7,16 @@ define amdgpu_kernel void @ds_swizzle(ptr addrspace(1) %out, i32 %src) #0 {
ret void
}
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
- %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v, ptr addrspace(1) %out
ret void
}
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v, ptr addrspace(1) %out
ret void
}
@@ -230,8 +230,8 @@ bb:
}
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
-declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1
-declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1
+declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
+declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlane16.var(i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 265d64f47bb23..b8dab361346c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -6,13 +6,16 @@
; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
-declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1)
+declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1)
+declare float @llvm.amdgcn.permlane16.f32(float, float, i32, i32, i1, i1)
+declare i64 @llvm.amdgcn.permlane16.i64(i64, i64, i32, i32, i1, i1)
+declare double @llvm.amdgcn.permlane16.f64(double, double, i32, i32, i1, i1)
declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1)
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.workitem.id.y()
-define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlane16_b32_vss:
+define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_vss_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -24,7 +27,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlane16_b32_vss:
+; GFX11-LABEL: v_permlane16_b32_vss_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
@@ -38,7 +41,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlane16_b32_vss:
+; GFX12-LABEL: v_permlane16_b32_vss_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
@@ -51,13 +54,252 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0) {
-; GFX10-LABEL: v_permlane16_b32_vii:
+define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_vss_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_vss_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_vss_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vss_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vss_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vss_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vss_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vss_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vss_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vss_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vss_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vss_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vss_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vss_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vss_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 %src0) {
+; GFX10-LABEL: v_permlane16_b32_vii_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
@@ -69,7 +311,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0
; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlane16_b32_vii:
+; GFX11-LABEL: v_permlane16_b32_vii_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
@@ -83,7 +325,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlane16_b32_vii:
+; GFX12-LABEL: v_permlane16_b32_vii_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -94,326 +336,2057 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false)
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false)
store i32 %v, ptr addrspace(1) %out
ret void
}
-; FIXME-GFX10PLUS: It is allowed to have both immediates as literals
-define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0) {
-; GFX10-LABEL: v_permlane16_b32_vll:
+define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float %src0) {
+; GFX10-LABEL: v_permlane16_b32_vii_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT: s_movk_i32 s0, 0x1234
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2
; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlane16_b32_vll:
+; GFX11-LABEL: v_permlane16_b32_vii_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_movk_i32 s2, 0x1234
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlane16_b32_vll:
+; GFX12-LABEL: v_permlane16_b32_vii_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+ %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
-; GFX10-LABEL: v_permlane16_b32_vvv:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 null, 0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1
-; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX10-NEXT: s_endpgm
+define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vii_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vii_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: v_permlane16_b32_vvv:
+; GFX11-SDAG-LABEL: v_permlane16_b32_vii_i64:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: v_permlane16_b32_vvv:
+; GFX11-GISEL-LABEL: v_permlane16_b32_vii_i64:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: v_permlane16_b32_vvv:
+; GFX12-SDAG-LABEL: v_permlane16_b32_vii_i64:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: v_permlane16_b32_vvv:
+; GFX12-GISEL-LABEL: v_permlane16_b32_vii_i64:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
-; GFX10-SDAG-LABEL: v_permlane16_b32_vvs:
+define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, double %src0) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vii_f64:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX10-GISEL-LABEL: v_permlane16_b32_vvs:
+; GFX10-GISEL-LABEL: v_permlane16_b32_vii_f64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3
-; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: v_permlane16_b32_vvs:
+; GFX11-SDAG-LABEL: v_permlane16_b32_vii_f64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vii_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vii_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vii_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+; FIXME-GFX10PLUS: It is allowed to have both immediates as literals
+define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 %src0) { ; i32 payload; both i32 selector operands are compile-time constants
+; GFX10-LABEL: v_permlane16_b32_vll_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_movk_i32 s0, 0x1234
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_vll_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_vll_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) ; 4660 = 0x1234, 49617 = 0xc1d1 (the values in the checked asm); both i1 flags are false
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { ; i64 payload; the checks expect one v_permlane16_b32 per 32-bit half (v0 and v1)
+; GFX10-SDAG-LABEL: v_permlane16_b32_vll_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vll_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vll_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vll_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vll_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vll_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) ; 4660 = 0x1234, 49617 = 0xc1d1; both i1 flags are false
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float %src0) { ; float payload; both i32 selector operands are compile-time constants
+; GFX10-LABEL: v_permlane16_b32_vll_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_movk_i32 s0, 0x1234
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_vll_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_vll_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) ; 4660 = 0x1234, 49617 = 0xc1d1 (the values in the checked asm); both i1 flags are false
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { ; double payload; the checks expect one v_permlane16_b32 per 32-bit half (v0 and v1)
+; GFX10-SDAG-LABEL: v_permlane16_b32_vll_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vll_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vll_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vll_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vll_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vll_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) ; 4660 = 0x1234, 49617 = 0xc1d1; both i1 flags are false
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %src0) { ; i32 payload; both i32 selector operands are workitem ids
+; GFX10-LABEL: v_permlane16_b32_vvv_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) ; the checks expect each id to reach an SGPR through v_readfirstlane_b32; both i1 flags are false
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { ; i64 payload; selector operands are workitem ids; one v_permlane16_b32 is expected per 32-bit half
+; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_vvv_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s4, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_vvv_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s5, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s4, s5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) ; the checks expect each id to reach an SGPR through v_readfirstlane_b32; both i1 flags are false
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float %src0) { ; float payload; both i32 selector operands are workitem ids
+; GFX10-LABEL: v_permlane16_b32_vvv_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) ; the checks expect each id to reach an SGPR through v_readfirstlane_b32; both i1 flags are false
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { ; double payload; selector operands are workitem ids; one v_permlane16_b32 is expected per 32-bit half
+; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_vvv_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s4, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_vvv_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s5, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s4, s5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) ; the checks expect each id to reach an SGPR through v_readfirstlane_b32; both i1 flags are false
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; i32 payload; third call operand is a workitem id, fourth is the kernel argument %src2
+; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3
+; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) ; the checks expect v_readfirstlane_b32 only for %tidx; %src2 is used from the SGPR it was loaded into
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 %src0, i32 %src2) { ; i64 payload; third call operand is a workitem id, fourth is the kernel argument %src2
+; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-SDAG-NEXT: s_mov_b32 null, 0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s2
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s2
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_mov_b32 null, 0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s2
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s2
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) ; the checks expect v_readfirstlane_b32 only for %tidx and one v_permlane16_b32 per 32-bit half
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 of an f32 value: the third operand is the workitem id x and the
+; fourth is the %src2 kernel argument; both i1 flags are false. The checks
+; expect the workitem id to pass through v_readfirstlane_b32 and feed the first
+; scalar operand of a single v_permlane16_b32.
+define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3
+; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 of a double: the third operand is the workitem id x and the fourth
+; is the %src2 kernel argument; both i1 flags are false. The checks expect two
+; v_permlane16_b32 instructions, one per 32-bit register of the value, both fed
+; by the same v_readfirstlane_b32 result.
+define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-SDAG-NEXT: s_mov_b32 null, 0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s2
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s2
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_mov_b32 null, 0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s2
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s2
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 of an i32 value: the third operand is the %src1 kernel argument and
+; the fourth is the workitem id y; both i1 flags are false. The checks expect
+; the workitem id to pass through v_readfirstlane_b32 (preceded by a v_bfe_u32
+; of v0 on GFX11 and GFX12) and feed the last scalar operand of v_permlane16_b32.
+define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2
+; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
+; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 of an i64 value: the third operand is the %src1 kernel argument and
+; the fourth is the workitem id y; both i1 flags are false. The checks expect
+; two v_permlane16_b32 instructions, one per 32-bit register of the value,
+; sharing the same scalar operands.
+define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-SDAG-NEXT: s_mov_b32 null, 0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s0
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s0
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_mov_b32 null, 0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s0
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s0
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 of an f32 value: the third operand is the %src1 kernel argument and
+; the fourth is the workitem id y; both i1 flags are false. The checks expect
+; the workitem id to pass through v_readfirstlane_b32 (preceded by a v_bfe_u32
+; of v0 on GFX11 and GFX12) and feed the last scalar operand of v_permlane16_b32.
+define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2
+; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
+; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 of a double: the third operand is the %src1 kernel argument and the
+; fourth is the workitem id y; both i1 flags are false. The checks expect two
+; v_permlane16_b32 instructions, one per 32-bit register of the value, sharing
+; the same scalar operands.
+define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, double %src0, i32 %src1) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-SDAG-NEXT: s_mov_b32 null, 0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s0
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s0
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_mov_b32 null, 0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s0
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s0
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 of an i32 value where both the third and fourth operands are kernel
+; arguments (%src1, %src2) and the first i1 flag is true (presumably the "fi"
+; flag, going by the test name). The checks expect a single v_permlane16_b32
+; carrying op_sel:[1,0], and are shared between selectors (no SDAG/GISEL split).
+define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_vss_fi_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_vss_fi_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_vss_fi_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 of an i64 value where both the third and fourth operands are kernel
+; arguments (%src1, %src2) and the first i1 flag is true (presumably the "fi"
+; flag, going by the test name). The checks expect two v_permlane16_b32
+; instructions, one per 32-bit register of the value, each with op_sel:[1,0].
+define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_vss_fi_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_vss_fi_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_vss_fi_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: v_permlane16_b32_vvs:
+; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_f64:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: v_permlane16_b32_vvs:
+; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_f64:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: v_permlane16_b32_vvs:
+; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_f64:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
+ %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_vss_bc_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_vss_bc_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_vss_bc_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-SDAG-LABEL: v_permlane16_b32_vsv:
+define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_i64:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2
-; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX10-GISEL-LABEL: v_permlane16_b32_vsv:
+; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_i64:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
-; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: v_permlane16_b32_vsv:
+; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_i64:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: v_permlane16_b32_vsv:
+; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_i64:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: v_permlane16_b32_vsv:
+; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_i64:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: v_permlane16_b32_vsv:
+; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_i64:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
- %tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
+ store i64 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlane16_b32_vss_fi:
+define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_vss_bc_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -421,11 +2394,11 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0]
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlane16_b32_vss_fi:
+; GFX11-LABEL: v_permlane16_b32_vss_bc_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
@@ -433,13 +2406,13 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlane16_b32_vss_fi:
+; GFX12-LABEL: v_permlane16_b32_vss_bc_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
@@ -447,18 +2420,115 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %s
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
- store i32 %v, ptr addrspace(1) %out
+ %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true)
+ store double %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlane16_b32_vss_bc:
+define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -466,11 +2536,11 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1]
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlane16_b32_vss_bc:
+; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
@@ -478,13 +2548,13 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1]
; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlane16_b32_vss_bc:
+; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
@@ -492,18 +2562,115 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %s
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1]
; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlane16_b32_vss_fi_bc:
+define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -515,7 +2682,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlane16_b32_vss_fi_bc:
+; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
@@ -529,7 +2696,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlane16_b32_vss_fi_bc:
+; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
@@ -542,8 +2709,105 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
- store i32 %v, ptr addrspace(1) %out
+ %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true)
+ store double %v, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 483ea8ad57d1b..925e88d041715 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2933,37 +2933,37 @@ define amdgpu_kernel void @update_dpp_undef_old(ptr addrspace(1) %out, i32 %in1)
; llvm.amdgcn.permlane16
; --------------------------------------------------------------------
-declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1 immarg, i1 immarg)
+declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1 immarg, i1 immarg)
define amdgpu_kernel void @permlane16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; CHECK-LABEL: @permlane16(
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false)
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false)
; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
- %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ %res = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store i32 %res, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @permlane16_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; CHECK-LABEL: @permlane16_bound_ctrl(
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true)
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true)
; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
- %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
+ %res = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
store i32 %res, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; CHECK-LABEL: @permlane16_fetch_invalid_bound_ctrl(
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true)
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true)
; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
- %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
+ %res = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
store i32 %res, ptr addrspace(1) %out
ret void
}
@@ -2972,37 +2972,37 @@ define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl(ptr addrspace(1)
; llvm.amdgcn.permlanex16
; --------------------------------------------------------------------
-declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1 immarg, i1 immarg)
+declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1 immarg, i1 immarg)
define amdgpu_kernel void @permlanex16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; CHECK-LABEL: @permlanex16(
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false)
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false)
; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
- %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ %res = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store i32 %res, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @permlanex16_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; CHECK-LABEL: @permlanex16_bound_ctrl(
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true)
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true)
; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
- %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
+ %res = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
store i32 %res, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; CHECK-LABEL: @permlanex16_fetch_invalid_bound_ctrl(
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true)
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true)
; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
- %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
+ %res = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
store i32 %res, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
index bb370a6d1dfeb..5a061fc286935 100644
--- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
@@ -513,31 +513,31 @@ define i32 @test_udot4(i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3) {
ret i32 %val
}
-declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1)
+declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1)
define i32 @test_permlane16(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 %arg4) {
; CHECK: immarg operand has non-immediate parameter
; CHECK-NEXT: i1 %arg3
- ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlane16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
- %v1 = call i32 @llvm.amdgcn.permlane16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
+ ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
+ %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
; CHECK: immarg operand has non-immediate parameter
; CHECK-NEXT: i1 %arg4
- ; CHECK-NEXT: call i32 @llvm.amdgcn.permlane16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
- %v2 = call i32 @llvm.amdgcn.permlane16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
+ ; CHECK-NEXT: call i32 @llvm.amdgcn.permlane16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
+ %v2 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
ret i32 %v2
}
-declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1)
+declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1)
define i32 @test_permlanex16(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 %arg4) {
; CHECK: immarg operand has non-immediate parameter
; CHECK-NEXT: i1 %arg3
- ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlanex16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
- %v1 = call i32 @llvm.amdgcn.permlanex16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
+ ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
+ %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
; CHECK: immarg operand has non-immediate parameter
; CHECK-NEXT: i1 %arg4
- ; CHECK-NEXT: call i32 @llvm.amdgcn.permlanex16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
- %v2 = call i32 @llvm.amdgcn.permlanex16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
+ ; CHECK-NEXT: call i32 @llvm.amdgcn.permlanex16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
+ %v2 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
ret i32 %v2
}
>From 827d209bd87738da673da40c8e7d9ce870fe9140 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 20 May 2024 07:36:01 -0400
Subject: [PATCH 03/11] fix builtin handling
---
clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl | 4 ++--
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl | 2 +-
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 5 ++---
4 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9ce2f5b6c103b..4184074860bcc 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18493,7 +18493,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
llvm::Value *Src5 = EmitScalarExpr(E->getArg(5));
- llvm::Function *F = CGM.getIntrinsic(IID, Src1->getType());
+ llvm::Function *F = CGM.getIntrinsic(IID, Src0->getType());
return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5});
}
case AMDGPU::BI__builtin_amdgcn_permlane64:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
index 3c40370e7f107..bc4933edeb32e 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -7,13 +7,13 @@ typedef unsigned int uint;
typedef unsigned long ulong;
// CHECK-LABEL: @test_permlane16(
-// CHECK: call i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
+// CHECK: call i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
}
// CHECK-LABEL: @test_permlanex16(
-// CHECK: call i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
+// CHECK: call i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index d17ff81e5d43c..2f45ef326eef2 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -35,7 +35,7 @@ void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
}
// CHECK-LABEL: @test_permlane64(
-// CHECK: call i32 @llvm.amdgcn.permlane64(i32 %a)
+// CHECK: call i32 @llvm.amdgcn.permlane64.i32(i32 %a)
void test_permlane64(global uint* out, uint a) {
*out = __builtin_amdgcn_permlane64(a);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 63f7f48e82e4a..ba2582588db94 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2487,14 +2487,14 @@ def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
//===----------------------------------------------------------------------===//
// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
-def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
+def int_amdgcn_permlane16 :
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
-def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
+def int_amdgcn_permlanex16 :
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
@@ -2539,7 +2539,6 @@ def int_amdgcn_image_bvh_intersect_ray :
// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
- ClangBuiltin<"__builtin_amdgcn_permlane64">,
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
>From 6047848af702cc5d781442ee8fb339a9ab567e65 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 27 May 2024 06:47:09 +0000
Subject: [PATCH 04/11] Review comments
---
clang/lib/CodeGen/CGBuiltin.cpp | 9 ++++-----
llvm/docs/AMDGPUUsage.rst | 19 +++++++++++++++++++
2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4184074860bcc..d9ab0051f0d23 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18481,11 +18481,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
}
case AMDGPU::BI__builtin_amdgcn_permlane16:
case AMDGPU::BI__builtin_amdgcn_permlanex16: {
- Intrinsic::ID IID;
- IID = BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
- ? Intrinsic::amdgcn_permlane16
- : Intrinsic::amdgcn_permlanex16;
-
llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
@@ -18493,6 +18488,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
llvm::Value *Src5 = EmitScalarExpr(E->getArg(5));
+ Intrinsic::ID IID = BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
+ ? Intrinsic::amdgcn_permlane16
+ : Intrinsic::amdgcn_permlanex16;
+
llvm::Function *F = CGM.getIntrinsic(IID, Src0->getType());
return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5});
}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 51969be85648f..97e2aecd614ed 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1190,6 +1190,25 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
reduction will be performed using default iterative strategy.
Intrinsic is currently only implemented for i32.
+ llvm.amdgcn.permlane16 Provides direct access to v_permlane16_b32. Performs an arbitrary gather-style
+ operation within a row (16 contiguous lanes) of the second input operand.
+ The third and fourth inputs must be scalar values. These are combined into
+ a single 64-bit value representing lane selects used to swizzle within each
+ row. Currently implemented for i16, i32, float, half, bf16, v2i16, v2f16 and
+ types whose sizes are multiples of 32-bit.
+
+ llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs an arbitrary gather-style
+ operation across two rows of the second input operand (each row is 16 contiguous
+ lanes). The third and fourth inputs must be scalar values. These are combined
+ into a single 64-bit value representing lane selects used to swizzle within each
+ row. Currently implemented for i16, i32, float, half, bf16, v2i16, v2f16 and types
+ whose sizes are multiples of 32-bit.
+
+ llvm.amdgcn.permlane64 Provides direct access to v_permlane64_b32. Performs a specific permutation across
+ lanes of the input operand where the high half and low half of a wave64 are swapped.
+ Performs no operation in wave32 mode. Currently implemented for i16, i32, float,
+ half, bf16, v2i16, v2f16 and types whose sizes are multiples of 32-bit.
+
llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
support such instructions. This performs unsigned dot product
with two v2i16 operands, summed with the third i32 operand. The
>From 8a36f078ea65a267713cfe477f62500fa1964e21 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 29 May 2024 11:54:16 +0000
Subject: [PATCH 05/11] updated test cases, added new pointer/vector tests
---
.../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 7558 +++++++++++++++--
.../AMDGPU/llvm.amdgcn.permlane.ptr.ll | 694 ++
.../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 401 +-
.../AMDGPU/llvm.amdgcn.permlane64.ptr.ll | 180 +
4 files changed, 8081 insertions(+), 752 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index b8dab361346c3..1ae22c3eec185 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -6,10 +6,7 @@
; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
-declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1)
-declare float @llvm.amdgcn.permlane16.f32(float, float, i32, i32, i1, i1)
-declare i64 @llvm.amdgcn.permlane16.i64(i64, i64, i32, i32, i1, i1)
-declare double @llvm.amdgcn.permlane16.f64(double, double, i32, i32, i1, i1)
+declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1)
declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1)
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.workitem.id.y()
@@ -1752,7 +1749,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v = call i64 @llvm.amdgcn.permlane16(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
store i64 %v, ptr addrspace(1) %out
ret void
}
@@ -2811,8 +2808,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlanex16_b32_vss:
+define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_vss_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -2824,7 +2821,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlanex16_b32_vss:
+; GFX11-LABEL: v_permlanex16_b32_vss_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
@@ -2838,7 +2835,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlanex16_b32_vss:
+; GFX12-LABEL: v_permlanex16_b32_vss_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
@@ -2851,1112 +2848,7189 @@ define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src0) {
-; GFX10-LABEL: v_permlanex16_b32_vii:
+define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_vss_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2
-; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlanex16_b32_vii:
+; GFX11-LABEL: v_permlanex16_b32_vss_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlanex16_b32_vii:
+; GFX12-LABEL: v_permlanex16_b32_vss_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
ret void
}
-; FIXME-GFX10PLUS: It is allowed to have both immediates as literals
-define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src0) {
-; GFX10-LABEL: v_permlanex16_b32_vll:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT: s_movk_i32 s0, 0x1234
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1
-; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: v_permlanex16_b32_vll:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_movk_i32 s2, 0x1234
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlanex16_b32_vll:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
-; GFX10-LABEL: v_permlanex16_b32_vvv:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 null, 0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1
-; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX10-NEXT: s_endpgm
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv:
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_i64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv:
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_i64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv:
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_i64:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv:
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_i64:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
-; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs:
+define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_f64:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs:
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_f64:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3
-; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs:
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_f64:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs:
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_f64:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs:
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_f64:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs:
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_f64:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 %src0) {
+; GFX10-LABEL: v_permlanex16_b32_vii_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_vii_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_vii_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv:
+define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, float %src0) {
+; GFX10-LABEL: v_permlanex16_b32_vii_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_vii_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_vii_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_i64:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2
-; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv:
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_i64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
-; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv:
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_i64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv:
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_i64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv:
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_i64:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv:
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_i64:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
- %tidy = call i32 @llvm.amdgcn.workitem.id.y()
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, double %src0) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_vss_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlanex16_b32_vss_fi:
+; FIXME-GFX10PLUS: It is allowed to have both immediates as literals
+define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 %src0) {
+; GFX10-LABEL: v_permlanex16_b32_vll_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_movk_i32 s0, 0x1234
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0]
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlanex16_b32_vss_fi:
+; GFX11-LABEL: v_permlanex16_b32_vll_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
-; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlanex16_b32_vss_fi:
+; GFX12-LABEL: v_permlanex16_b32_vll_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
-; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_vss_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlanex16_b32_vss_bc:
+define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, float %src0) {
+; GFX10-LABEL: v_permlanex16_b32_vll_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_movk_i32 s0, 0x1234
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1]
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlanex16_b32_vss_bc:
+; GFX11-LABEL: v_permlanex16_b32_vll_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: v_permlanex16_b32_vss_bc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1]
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlane16_b32_tid_tid:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: v_permlane16_b32_tid_tid:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: v_permlane16_b32_tid_tid:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlane16_b32_undef_tid:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: v_permlane16_b32_undef_tid:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlane16_b32_undef_tid:
+; GFX12-LABEL: v_permlanex16_b32_vll_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %undef = freeze i32 poison
- %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid:
+define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_i64:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5]
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid:
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_i64:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid:
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_i64:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid:
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_i64:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid:
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_i64:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid:
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_i64:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlane16_b32_i_tid_fi:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: v_permlane16_b32_i_tid_fi:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, double %src0) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlane16_b32_i_tid_fi:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 %src0) {
+; GFX10-LABEL: v_permlanex16_b32_vvv_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, float %src0) {
+; GFX10-LABEL: v_permlanex16_b32_vvv_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_vvv_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s4, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_vvv_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s5, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s4, s5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_vvv_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s4, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_vvv_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s5, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s4, s5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3
+; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3
+; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 %src0, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-SDAG-NEXT: s_mov_b32 null, 0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s2
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s2
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_mov_b32 null, 0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s2
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s2
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-SDAG-NEXT: s_mov_b32 null, 0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s2
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s2
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_mov_b32 null, 0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s2
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s2
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2
+; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
+; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2
+; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
+; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-SDAG-NEXT: s_mov_b32 null, 0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s0
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s0
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_mov_b32 null, 0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s0
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s0
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, double %src0, i32 %src1) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-SDAG-NEXT: s_mov_b32 null, 0
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s0
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s0
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_mov_b32 null, 0
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s0
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s0
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_vss_fi_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_vss_fi_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_vss_fi_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_vss_fi_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_vss_fi_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_vss_fi_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_vss_bc_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_vss_bc_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_vss_bc_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_vss_bc_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_vss_bc_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_vss_bc_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_tid_tid_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_tid_tid_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_tid_tid_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_tid_tid_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_tid_tid_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_tid_tid_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %v = call float @llvm.amdgcn.permlane16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %v = call double @llvm.amdgcn.permlane16.f64(double %tidx_f64, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_undef_tid_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_undef_tid_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_undef_tid_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %undef = freeze i32 poison
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_undef_tid_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_undef_tid_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_undef_tid_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %undef = freeze float poison
+ %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %undef = freeze i64 poison
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %undef = freeze double poison
+ %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; permlane16.i32 with a constant first operand; %src0 is not referenced in the body
+; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x() ; workitem id feeds the second intrinsic operand
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) ; 12345 == 0x3039, which the checks move into the permlane destination VGPR first; both i1 flags false, so no op_sel appears
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; permlane16.f32 with a constant first operand; %src0 is not referenced in the body
+; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float ; bit reinterpretation only; the checks contain no conversion instruction
+ %v = call float @llvm.amdgcn.permlane16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) ; 1234.5 has bit pattern 0x449a5000, which the checks move into the permlane destination VGPR first
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; permlane16.i64 with a constant first operand; checks expect one v_permlane16_b32 per 32-bit half; %src0 is not referenced in the body
+; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64 ; zero-extended, so the upper 32 bits of the second operand are 0
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) ; checks seed the low-half destination with 0x3039 and the high-half destination with 0; SDAG emits the high half first, GISEL the low half first
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; permlane16.f64 with a constant first operand; checks expect one v_permlane16_b32 per 32-bit half; %src0 is not referenced in the body
+; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float ; bit reinterpretation only
+ %tidx_f64 = fpext float %tidx_f32 to double ; shows up as v_cvt_f64_f32 in every check block
+ %v = call double @llvm.amdgcn.permlane16.f64(double 1234.5, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) ; double 1234.5 is 0x40934a00 (high dword) : 0 (low dword), which the checks move into the two destination VGPRs
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; permlane16.i32 with the fifth (i1) operand set; %src0 is not referenced in the body
+; GFX10-LABEL: v_permlane16_b32_i_tid_fi_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_i_tid_fi_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_i_tid_fi_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %undef = freeze i32 poison ; arbitrary first operand; the checks show no instruction initializing it
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) ; fifth operand true ("fi" in the test name, presumably fetch-inactive) shows up as op_sel:[1,0] in the checks
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; permlane16.f32 with the fifth (i1) operand set; %src0 is not referenced in the body
+; GFX10-LABEL: v_permlane16_b32_i_tid_fi_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_i_tid_fi_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_i_tid_fi_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float ; bit reinterpretation only; the checks contain no conversion instruction
+ %undef = freeze float poison ; arbitrary first operand; the checks show no instruction initializing it
+ %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 false) ; fifth operand true ("fi" in the test name, presumably fetch-inactive) shows up as op_sel:[1,0] in the checks
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; permlane16.i64 with the fifth (i1) operand set; checks expect one v_permlane16_b32 per 32-bit half; %src0 is not referenced in the body
+; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64 ; upper 32 bits are 0; the checks use the zeroed v2 as the source of the high-half permlane
+ %undef = freeze i64 poison ; arbitrary first operand; the checks show no instruction initializing it
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 false) ; fifth operand true shows up as op_sel:[1,0] on both per-half permlanes
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; permlane16.f64 with the fifth (i1) operand set; checks expect one v_permlane16_b32 per 32-bit half; %src0 is not referenced in the body
+; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float ; bit reinterpretation only
+ %tidx_f64 = fpext float %tidx_f32 to double ; shows up as v_cvt_f64_f32 in every check block
+ %undef = freeze double poison ; arbitrary first operand; the checks show no instruction initializing it
+ %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 false) ; fifth operand true shows up as op_sel:[1,0] on both per-half permlanes
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; permlane16.i32 with the sixth (i1) operand set; %src0 is not referenced in the body
+; GFX10-LABEL: v_permlane16_b32_i_tid_bc_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_i_tid_bc_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_i_tid_bc_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %undef = freeze i32 poison ; arbitrary first operand; the checks show no instruction initializing it
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) ; sixth operand true ("bc" in the test name, presumably bound-control) shows up as op_sel:[0,1] in the checks
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; permlane16.f32 with the sixth (i1) operand set; %src0 is not referenced in the body
+; GFX10-LABEL: v_permlane16_b32_i_tid_bc_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_i_tid_bc_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_i_tid_bc_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float ; bit reinterpretation only; the checks contain no conversion instruction
+ %undef = freeze float poison ; arbitrary first operand; the checks show no instruction initializing it
+ %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 true) ; sixth operand true ("bc" in the test name, presumably bound-control) shows up as op_sel:[0,1] in the checks
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %undef = freeze i64 poison
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 true)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %undef = freeze double poison
+ %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 true)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %undef = freeze i32 poison
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %undef = freeze float poison
+ %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 true)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %undef = freeze i64 poison
+ %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 true)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %undef = freeze double poison
+ %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 true)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_tid_tid_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_tid_tid_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_tid_tid_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_tid_tid_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_tid_tid_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_tid_tid_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %tidx_f64, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_undef_tid_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_undef_tid_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_undef_tid_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %undef = freeze i32 poison
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_undef_tid_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_undef_tid_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_undef_tid_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %undef = freeze float poison
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %undef = freeze i64 poison
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %undef = freeze double poison
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %v = call float @llvm.amdgcn.permlanex16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3
+; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3
+; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3
+; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3
+; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3
+; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3
+; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %v = call double @llvm.amdgcn.permlanex16.f64(double 1234.5, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %undef = freeze i32 poison
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %undef = freeze float poison
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 false)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %undef = freeze i64 poison
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 false)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %undef = freeze double poison
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 false)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%undef = freeze i32 poison
- %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false)
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlane16_b32_i_tid_bc:
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlane16_b32_i_tid_bc:
+; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlane16_b32_i_tid_bc:
+; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %undef = freeze i32 poison
- %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true)
- store i32 %v, ptr addrspace(1) %out
+ %tidx_f32 = bitcast i32 %tidx to float
+ %undef = freeze float poison
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 true)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %undef = freeze i64 poison
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 true)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %undef = freeze double poison
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 true)
+ store double %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc:
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc:
+; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc:
+; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%undef = freeze i32 poison
- %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true)
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlanex16_b32_tid_tid:
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_permlanex16_b32_tid_tid:
+; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: v_permlanex16_b32_tid_tid:
+; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+ %tidx_f32 = bitcast i32 %tidx to float
+ %undef = freeze float poison
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 true)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %undef = freeze i64 poison
+ %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 true)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlanex16 on a double built from the workitem id (bitcast to float, then
+; fpext), with a frozen-poison old value and both fi and bc set to true.
+; Each check prefix below expects two v_permlanex16_b32 with op_sel:[1,1],
+; one per 32-bit half of the value, followed by a single 64-bit store. The
+; SDAG prefixes list the high half (v1) first, the GISEL prefixes the low
+; half (v0) first.
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %undef = freeze double poison
+ %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 true)
+ store double %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlanex16_b32_undef_tid:
+define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_half:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlane16_half:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_permlanex16_b32_undef_tid:
+; GFX12-LABEL: v_permlane16_half:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call half @llvm.amdgcn.permlane16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store half %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_half:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlanex16_half:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlanex16_half:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call half @llvm.amdgcn.permlanex16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store half %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 on a bfloat value passed as both the old and src0 operands, with
+; fi and bc false. The checks expect two v_readfirstlane_b32, a single
+; v_permlane16_b32, and a 16-bit store (global_store_short / global_store_b16).
+; The intrinsic is spelled with the bf16 type suffix so that its name matches
+; the bfloat operands and does not reuse the name of the half variant.
+define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_bfloat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlane16_bfloat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlane16_bfloat:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call bfloat @llvm.amdgcn.permlane16.bf16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store bfloat %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlanex16 on a bfloat value passed as both the old and src0 operands, with
+; fi and bc false. The checks expect two v_readfirstlane_b32, a single
+; v_permlanex16_b32, and a 16-bit store (global_store_short / global_store_b16).
+; The intrinsic is spelled with the bf16 type suffix so that its name matches
+; the bfloat operands and does not reuse the name of the half variant.
+define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_bfloat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlanex16_bfloat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlanex16_bfloat:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call bfloat @llvm.amdgcn.permlanex16.bf16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store bfloat %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 on an i16 value passed as both the old and src0 operands, with
+; fi and bc false. The checks expect two v_readfirstlane_b32, a single
+; v_permlane16_b32, and a 16-bit store (global_store_short / global_store_b16).
+define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlane16_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlane16_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call i16 @llvm.amdgcn.permlane16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i16 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlanex16 on an i16 value passed as both the old and src0 operands, with
+; fi and bc false. The checks expect two v_readfirstlane_b32, a single
+; v_permlanex16_b32, and a 16-bit store (global_store_short / global_store_b16).
+define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlanex16_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlanex16_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call i16 @llvm.amdgcn.permlanex16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i16 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; permlane16 on a <2 x half> value passed as both the old and src0 operands,
+; with fi and bc false. The checks expect two v_readfirstlane_b32, a single
+; v_permlane16_b32 covering the whole vector, and one 32-bit store
+; (global_store_dword / global_store_b32).
+define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlane16_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlane16_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x half> @llvm.amdgcn.permlane16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <2 x half> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlanex16_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: v_permlanex16_b32_undef_tid:
+; GFX12-LABEL: v_permlanex16_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %undef = freeze i32 poison
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x half> @llvm.amdgcn.permlanex16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <2 x half> %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid:
+define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v2f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5]
-; GFX10-SDAG-NEXT: s_endpgm
-;
-; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v2f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
-; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
-; GFX10-GISEL-NEXT: s_endpgm
-;
-; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v2f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid:
+; GFX11-GISEL-LABEL: v_permlane16_v2f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-GISEL-NEXT: s_endpgm
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid:
+; GFX12-SDAG-LABEL: v_permlane16_v2f32:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid:
+; GFX12-GISEL-LABEL: v_permlane16_v2f32:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
- store i32 %v, ptr addrspace(1) %out
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x float> @llvm.amdgcn.permlane16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <2 x float> %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlanex16_b32_i_tid_fi:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT: s_endpgm
+define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v2f32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlanex16_v2f32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v2f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_permlanex16_b32_i_tid_fi:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-GISEL-LABEL: v_permlanex16_v2f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: v_permlanex16_b32_i_tid_fi:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %undef = freeze i32 poison
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false)
- store i32 %v, ptr addrspace(1) %out
+; GFX12-SDAG-LABEL: v_permlanex16_v2f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlanex16_v2f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x float> @llvm.amdgcn.permlanex16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <2 x float> %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlanex16_b32_i_tid_bc:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT: s_endpgm
+define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v7i32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v10
+; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v7i32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v9
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v10
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v7i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v9
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v10
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_permlanex16_b32_i_tid_bc:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-GISEL-LABEL: v_permlane16_v7i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v9
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v10
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: v_permlanex16_b32_i_tid_bc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %undef = freeze i32 poison
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true)
- store i32 %v, ptr addrspace(1) %out
+; GFX12-SDAG-LABEL: v_permlane16_v7i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v7i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <7 x i32> @llvm.amdgcn.permlane16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <7 x i32> %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
-; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT: s_endpgm
+define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v7i32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v10
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlanex16_v7i32:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v9
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v10
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v7i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v9
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v10
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11-GISEL-LABEL: v_permlanex16_v7i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v9
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v10
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
- %tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %undef = freeze i32 poison
- %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true)
- store i32 %v, ptr addrspace(1) %out
+; GFX12-SDAG-LABEL: v_permlanex16_v7i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlanex16_v7i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <7 x i32> @llvm.amdgcn.permlanex16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <7 x i32> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v8i16:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v8i16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v8i16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v8i16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v8i16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v8i16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <8 x i16> @llvm.amdgcn.permlane16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <8 x i16> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v8i16:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlanex16_v8i16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v8i16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlanex16_v8i16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_v8i16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlanex16_v8i16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <8 x i16> @llvm.amdgcn.permlanex16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <8 x i16> %v, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
new file mode 100644
index 0000000000000..bb42834221681
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
@@ -0,0 +1,694 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-SDAG %s
+; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s
+; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+
+define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_p0:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_p0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_p0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call ptr @llvm.amdgcn.permlane16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store ptr %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_p0:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_p0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_p0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call ptr @llvm.amdgcn.permlanex16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store ptr %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v3p0:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9
+; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v3p0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v3p0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x ptr> @llvm.amdgcn.permlane16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x ptr> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v3p0:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v3p0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_v3p0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x ptr> @llvm.amdgcn.permlanex16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x ptr> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_p3:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_p3:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_p3:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call ptr addrspace(3) @llvm.amdgcn.permlane16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store ptr addrspace(3) %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_p3:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_p3:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_p3:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call ptr addrspace(3) @llvm.amdgcn.permlanex16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store ptr addrspace(3) %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v3p3:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v3p3:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v3p3:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v3p3:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v3p3:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_v3p3:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlanex16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_p5:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_p5:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_p5:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call ptr addrspace(5) @llvm.amdgcn.permlane16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store ptr addrspace(5) %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_p5:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_p5:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_p5:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call ptr addrspace(5) @llvm.amdgcn.permlanex16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store ptr addrspace(5) %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v3p5:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v3p5:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v3p5:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v3p5:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v3p5:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_v3p5:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlanex16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_p6:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_p6:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_p6:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call ptr addrspace(6) @llvm.amdgcn.permlane16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store ptr addrspace(6) %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_p6:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_p6:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_p6:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call ptr addrspace(6) @llvm.amdgcn.permlanex16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store ptr addrspace(6) %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v3p6:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v3p6:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v3p6:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlanex16_v3p6:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlanex16_v3p6:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlanex16_v3p6:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlanex16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index b81cb97725648..f653baa7365c7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -5,8 +5,8 @@
declare i32 @llvm.amdgcn.permlane64(i32)
declare i32 @llvm.amdgcn.workitem.id.x()
-define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
-; GFX11-LABEL: test_s:
+define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) {
+; GFX11-LABEL: test_s_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
@@ -19,13 +19,98 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane64(i32 %src0)
+ %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
-; GFX11-LABEL: test_i:
+define amdgpu_kernel void @test_s_f32(ptr addrspace(1) %out, float %src0) {
+; GFX11-LABEL: test_s_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane64_b32 v0, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.permlane64.f32(float %src0)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) {
+; GFX11-SDAG-LABEL: test_s_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
+; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_s_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) {
+; GFX11-SDAG-LABEL: test_s_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
+; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_s_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlane64.f64(double %src0)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) {
+; GFX11-LABEL: test_i_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
@@ -36,13 +121,95 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- %v = call i32 @llvm.amdgcn.permlane64(i32 99)
+ %v = call i32 @llvm.amdgcn.permlane64.i32(i32 99)
store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
-; GFX11-SDAG-LABEL: test_v:
+define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) {
+; GFX11-LABEL: test_i_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane64_b32 v0, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.permlane64.f32(float 1234.5)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: test_i_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_i_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.permlane64.i64(i64 99)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: test_i_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_i_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+ %v = call double @llvm.amdgcn.permlane64.f64(double 1234.5)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 {
+; GFX11-SDAG-LABEL: test_v_i32:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
@@ -53,7 +220,7 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: test_v:
+; GFX11-GISEL-LABEL: test_v_i32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
@@ -64,7 +231,221 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
- %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx)
+ %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %tidx)
store i32 %v, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 {
+; GFX11-SDAG-LABEL: test_v_f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_v_f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %v = call float @llvm.amdgcn.permlane64.f32(float %tidx_f32)
+ store float %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_v_i64(ptr addrspace(1) %out, i64 %src0) #1 {
+; GFX11-LABEL: test_v_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_permlane64_b32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_permlane64_b32 v1, v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_i64 = zext i32 %tidx to i64
+ %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %tidx_i64)
+ store i64 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 {
+; GFX11-SDAG-LABEL: test_v_f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_v_f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidx_f32 = bitcast i32 %tidx to float
+ %tidx_f64 = fpext float %tidx_f32 to double
+ %v = call double @llvm.amdgcn.permlane64.f64(double %tidx_f64)
+ store double %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_half(ptr addrspace(1) %out, half %src0) {
+; GFX11-LABEL: test_half:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_permlane64_b32 v2, v2
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call half @llvm.amdgcn.permlane64.f16(half %src0)
+ store half %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) {
+; GFX11-LABEL: test_bfloat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_permlane64_b32 v2, v2
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0)
+ store bfloat %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_i16(ptr addrspace(1) %out, i16 %src0) {
+; GFX11-LABEL: test_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_permlane64_b32 v2, v2
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0)
+ store i16 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v2f16(ptr addrspace(1) %out, <2 x half> %src0) {
+; GFX11-LABEL: test_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_permlane64_b32 v2, v2
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0)
+ store <2 x half> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v2f32(ptr addrspace(1) %out, <2 x float> %src0) {
+; GFX11-SDAG-LABEL: test_v2f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v2f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
+; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x float> @llvm.amdgcn.permlane64.v2f32(<2 x float> %src0)
+ store <2 x float> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) {
+; GFX11-SDAG-LABEL: test_v7i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8
+; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7
+; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6
+; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
+; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v7i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
+; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
+; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
+; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6
+; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7
+; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <7 x i32> @llvm.amdgcn.permlane64.v7i32(<7 x i32> %src0)
+ store <7 x i32> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_v8i16(ptr addrspace(1) %out, <8 x i16> %src0) {
+; GFX11-SDAG-LABEL: test_v8i16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
+; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_v8i16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
+; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
+; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
+; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <8 x i16> @llvm.amdgcn.permlane64.v8i16(<8 x i16> %src0)
+ store <8 x i16> %v, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
new file mode 100644
index 0000000000000..2070a832e0fcd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s
+
+define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) {
+; GFX11-SDAG-LABEL: test_p0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
+; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+ %v = call ptr @llvm.amdgcn.permlane64.p0(ptr %src0)
+ store ptr %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) {
+; GFX11-SDAG-LABEL: test_v3p0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x2
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s6
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s2
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v4
+; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
+; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v8
+; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+ %v = call <3 x ptr> @llvm.amdgcn.permlane64.v3p0(<3 x ptr> %src0)
+ store <3 x ptr> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0) {
+; GFX11-SDAG-LABEL: test_p3:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+ %v = call ptr addrspace(3) @llvm.amdgcn.permlane64.p3(ptr addrspace(3) %src0)
+ store ptr addrspace(3) %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src0) {
+; GFX11-SDAG-LABEL: test_v3p3:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3
+; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+ %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane64.v3p3(<3 x ptr addrspace(3)> %src0)
+ store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0) {
+; GFX11-SDAG-LABEL: test_p5:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+ %v = call ptr addrspace(5) @llvm.amdgcn.permlane64.p5(ptr addrspace(5) %src0)
+ store ptr addrspace(5) %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src0) {
+; GFX11-SDAG-LABEL: test_v3p5:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3
+; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+ %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane64.v3p5(<3 x ptr addrspace(5)> %src0)
+ store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0) {
+; GFX11-SDAG-LABEL: test_p6:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+ %v = call ptr addrspace(6) @llvm.amdgcn.permlane64.p6(ptr addrspace(6) %src0)
+ store ptr addrspace(6) %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src0) {
+; GFX11-SDAG-LABEL: test_v3p6:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3
+; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+ %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane64.v3p6(<3 x ptr addrspace(6)> %src0)
+ store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
+ ret void
+}
>From 5a4c4c4dfbcbe85746947a6cfefa8dd48d793bd2 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Fri, 31 May 2024 08:11:42 +0000
Subject: [PATCH 06/11] Take over recent changes from original patch
---
llvm/docs/AMDGPUUsage.rst | 13 +-
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 186 +++---------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 109 +++++--
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 58 ----
.../AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll | 34 +++
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 66 -----
.../AMDGPU/llvm.amdgcn.readlane.ptr.ll | 38 +++
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 277 ------------------
.../AMDGPU/llvm.amdgcn.writelane.ptr.ll | 144 +++++++++
9 files changed, 347 insertions(+), 578 deletions(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 97e2aecd614ed..2d24d9e26dbed 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1194,20 +1194,21 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
operation within a row (16 contiguous lanes) of the second input operand.
The third and fourth inputs must be scalar values. these are combined into
a single 64-bit value representing lane selects used to swizzle within each
- row. Currently implemented for i16, i32, float, half, bf16, v2i16, v2f16 and
- types whose sizes are multiples of 32-bit.
+ row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>,
+ <2 x half>, <2 x bfloat>, i64, double, pointers, and vectors whose sizes are multiples of 32 bits.
llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs arbitrary gather-style
operation across two rows of the second input operand (each row is 16 contiguous
lanes). The third and fourth inputs must be scalar values. these are combined
into a single 64-bit value representing lane selects used to swizzle within each
- row. Currently implemented for i16, i32, float, half, bf16, v2i16, v2f16 and types
- whose sizes are multiples of 32-bit.
+ row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>,
+ <2 x bfloat>, i64, double, pointers, and vectors whose sizes are multiples of 32 bits.
llvm.amdgcn.permlane64 Provides direct access to v_permlane64_b32. Performs a specific permutation across
lanes of the input operand where the high half and low half of a wave64 are swapped.
- Performs no operation in wave32 mode. Currently implemented for i16, i32, float,
- half, bf16, v2i16, v2f16 and types whose sizes are multiples of 32-bit.
+ Performs no operation in wave32 mode. Currently implemented for i16, i32, float, half,
+ bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers, and vectors whose
+ sizes are multiples of 32 bits.
llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
support such instructions. This performs unsigned dot product
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b28c3521d6336..aa0ea06c6092a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5394,15 +5394,12 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();
- Register DstReg = MI.getOperand(0).getReg();
- Register Src0 = MI.getOperand(2).getReg();
-
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
- auto createLaneOp = [&](Register Src0, Register Src1,
- Register Src2) -> Register {
- auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0);
+ auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
+ Register Src2, LLT VT) -> Register {
+ auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane64:
@@ -5428,6 +5425,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
}
};
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsPermLane16) {
@@ -5446,156 +5445,65 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
}
if (Size < 32) {
- Register Src0Cast = MRI.getType(Src0).isScalar()
- ? Src0
- : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
- Src0 = B.buildAnyExt(S32, Src0Cast).getReg(0);
-
- if (IsPermLane16) {
- Register Src1Cast =
- MRI.getType(Src1).isScalar()
- ? Src1
- : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
- Src1 = B.buildAnyExt(LLT::scalar(32), Src1Cast).getReg(0);
- }
+ Src0 = B.buildAnyExt(S32, Src0).getReg(0);
- if (IID == Intrinsic::amdgcn_writelane) {
- Register Src2Cast =
- MRI.getType(Src2).isScalar()
- ? Src2
- : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
- Src2 = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
- }
+ if (IsPermLane16)
+ Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
- Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
- if (Ty.isScalar())
- B.buildTrunc(DstReg, LaneOpDst);
- else {
- auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDst);
- B.buildBitcast(DstReg, Trunc);
- }
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
+ Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
+ B.buildTrunc(DstReg, LaneOpDst);
MI.eraseFromParent();
return true;
}
- if ((Size % 32) == 0) {
- SmallVector<Register, 2> PartialRes;
- unsigned NumParts = Size / 32;
- auto IsS16Vec = Ty.isVector() && Ty.getElementType() == S16;
- MachineInstrBuilder Src0Parts;
-
- if (Ty.isPointer()) {
- auto PtrToInt = B.buildPtrToInt(LLT::scalar(Size), Src0);
- Src0Parts = B.buildUnmerge(S32, PtrToInt);
- } else if (Ty.isPointerVector()) {
- LLT IntVecTy = Ty.changeElementType(
- LLT::scalar(Ty.getElementType().getSizeInBits()));
- auto PtrToInt = B.buildPtrToInt(IntVecTy, Src0);
- Src0Parts = B.buildUnmerge(S32, PtrToInt);
- } else
- Src0Parts =
- IsS16Vec ? B.buildUnmerge(V2S16, Src0) : B.buildUnmerge(S32, Src0);
+ if (Size % 32 != 0)
+ return false;
- switch (IID) {
- case Intrinsic::amdgcn_readlane: {
- Register Src1 = MI.getOperand(3).getReg();
- for (unsigned i = 0; i < NumParts; ++i) {
- Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
- : Src0Parts.getReg(i);
- PartialRes.push_back(
- (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
- .addUse(Src0)
- .addUse(Src1))
- .getReg(0));
- }
+ LLT PartialResTy = S32;
+ if (Ty.isVector()) {
+ LLT EltTy = Ty.getElementType();
+ switch (EltTy.getSizeInBits()) {
+ case 16:
+ PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
break;
- }
- case Intrinsic::amdgcn_readfirstlane:
- case Intrinsic::amdgcn_permlane64: {
- for (unsigned i = 0; i < NumParts; ++i) {
- Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
- : Src0Parts.getReg(i);
- PartialRes.push_back(
- (B.buildIntrinsic(IID, {S32}).addUse(Src0).getReg(0)));
- }
-
+ case 32:
+ PartialResTy = EltTy;
break;
- }
- case Intrinsic::amdgcn_writelane:
- case Intrinsic::amdgcn_permlane16:
- case Intrinsic::amdgcn_permlanex16: {
- Register Src1 = MI.getOperand(3).getReg();
- Register Src2 = MI.getOperand(4).getReg();
-
- Register SrcX = IsPermLane16 ? Src1 : Src2;
- MachineInstrBuilder SrcXParts;
-
- if (Ty.isPointer()) {
- auto PtrToInt = B.buildPtrToInt(S64, SrcX);
- SrcXParts = B.buildUnmerge(S32, PtrToInt);
- } else if (Ty.isPointerVector()) {
- LLT IntVecTy = Ty.changeElementType(
- LLT::scalar(Ty.getElementType().getSizeInBits()));
- auto PtrToInt = B.buildPtrToInt(IntVecTy, SrcX);
- SrcXParts = B.buildUnmerge(S32, PtrToInt);
- } else
- SrcXParts =
- IsS16Vec ? B.buildUnmerge(V2S16, SrcX) : B.buildUnmerge(S32, SrcX);
-
- for (unsigned i = 0; i < NumParts; ++i) {
- Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
- : Src0Parts.getReg(i);
- SrcX = IsS16Vec ? B.buildBitcast(S32, SrcXParts.getReg(i)).getReg(0)
- : SrcXParts.getReg(i);
- PartialRes.push_back(IsPermLane16 ? createLaneOp(Src0, SrcX, Src2)
- : createLaneOp(Src0, Src1, SrcX));
- }
-
+ default:
+ // Handle all other cases via S32 pieces.
break;
}
- }
+ }
- if (Ty.isPointerVector()) {
- unsigned PtrSize = Ty.getElementType().getSizeInBits();
- SmallVector<Register, 2> PtrElements;
- if (PtrSize == 32) {
- // Handle 32 bit pointers
- for (unsigned i = 0; i < NumParts; i++)
- PtrElements.push_back(
- B.buildIntToPtr(Ty.getElementType(), PartialRes[i]).getReg(0));
- } else {
- // Handle legalization of <? x [pointer type bigger than 32 bits]>
- SmallVector<Register, 2> PtrParts;
- unsigned NumS32Parts = PtrSize / 32;
- unsigned PartIdx = 0;
- for (unsigned i = 0, j = 1; i < NumParts; i += NumS32Parts, j++) {
- // Merge S32 components of a pointer element first.
- for (; PartIdx < (j * NumS32Parts); PartIdx++)
- PtrParts.push_back(PartialRes[PartIdx]);
-
- auto MergedPtr =
- B.buildMergeLikeInstr(LLT::scalar(PtrSize), PtrParts);
- PtrElements.push_back(
- B.buildIntToPtr(Ty.getElementType(), MergedPtr).getReg(0));
- PtrParts.clear();
- }
- }
+ SmallVector<Register, 2> PartialRes;
+ unsigned NumParts = Size / 32;
+ MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
+ MachineInstrBuilder Src1Parts, Src2Parts;
- B.buildMergeLikeInstr(DstReg, PtrElements);
- } else {
- if (IsS16Vec) {
- for (unsigned i = 0; i < NumParts; i++)
- PartialRes[i] = B.buildBitcast(V2S16, PartialRes[i]).getReg(0);
- }
- B.buildMergeLikeInstr(DstReg, PartialRes);
- }
+ if (IsPermLane16)
+ Src1Parts = B.buildUnmerge(PartialResTy, Src1);
- MI.eraseFromParent();
- return true;
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2Parts = B.buildUnmerge(PartialResTy, Src2);
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Src0 = Src0Parts.getReg(i);
+
+ if (IsPermLane16)
+ Src1 = Src1Parts.getReg(i);
+
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2 = Src2Parts.getReg(i);
+
+ PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
}
- return false;
+ B.buildMergeLikeInstr(DstReg, PartialRes);
+ MI.eraseFromParent();
+ return true;
}
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5d34ed089f65d..acd290cc36441 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6093,30 +6093,36 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
unsigned IntrinsicID = N->getConstantOperandVal(0);
bool IsPermLane16 = IntrinsicID == Intrinsic::amdgcn_permlane16 ||
IntrinsicID == Intrinsic::amdgcn_permlanex16;
- bool IsPermLane64 = IntrinsicID == Intrinsic::amdgcn_permlane64;
SDValue Src0 = N->getOperand(1);
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);
- auto createLaneOp = [&](SDValue Src0, SDValue Src1, SDValue Src2,
- MVT ValueT) -> SDValue {
- if (IsPermLane16 || IsPermLane64) {
- if (IsPermLane16) {
- SDValue Src3 = N->getOperand(4);
- SDValue Src4 = N->getOperand(5);
- SDValue Src5 = N->getOperand(6);
- return DAG.getNode(IntrinsicID == Intrinsic::amdgcn_permlane16
- ? AMDGPUISD::PERMLANE16
- : AMDGPUISD::PERMLANEX16,
- SL, ValueT, {Src0, Src1, Src2, Src3, Src4, Src5});
- }
- return DAG.getNode(AMDGPUISD::PERMLANE64, SL, ValueT, {Src0});
+ auto createLaneOp = [&DAG, &SL, N](SDValue Src0, SDValue Src1, SDValue Src2,
+ MVT ValueT) -> SDValue {
+ switch (unsigned IID = N->getConstantOperandVal(0)) {
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_permlane64:
+ return DAG.getNode(IID == Intrinsic::amdgcn_readfirstlane
+ ? AMDGPUISD::READFIRSTLANE
+ : AMDGPUISD::PERMLANE64,
+ SL, ValueT, {Src0});
+ case Intrinsic::amdgcn_readlane:
+ return DAG.getNode(AMDGPUISD::READLANE, SL, ValueT, {Src0, Src1});
+ case Intrinsic::amdgcn_writelane:
+ return DAG.getNode(AMDGPUISD::WRITELANE, SL, ValueT, {Src0, Src1, Src2});
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
+ SDValue Src3 = N->getOperand(4);
+ SDValue Src4 = N->getOperand(5);
+ SDValue Src5 = N->getOperand(6);
+ return DAG.getNode(IID == Intrinsic::amdgcn_permlane16
+ ? AMDGPUISD::PERMLANE16
+ : AMDGPUISD::PERMLANEX16,
+ SL, ValueT, {Src0, Src1, Src2, Src3, Src4, Src5});
+ }
+ default:
+ llvm_unreachable("unhandled lane op");
}
-
- return (
- Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, ValueT, {Src0, Src1, Src2})
- : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, ValueT, {Src0, Src1})
- : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, ValueT, {Src0}));
};
SDValue Src1, Src2;
@@ -6133,31 +6139,73 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
}
if (ValSize < 32) {
- SDValue InitBitCast = DAG.getBitcast(IntVT, Src0);
- Src0 = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32);
+ bool IsFloat = VT.isFloatingPoint();
+ Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
+ SL, MVT::i32);
if (IsPermLane16) {
- SDValue Src1Cast = DAG.getBitcast(IntVT, Src1);
- Src1 = DAG.getAnyExtOrTrunc(Src1Cast, SL, MVT::i32);
+ Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
+ SL, MVT::i32);
}
if (IntrinsicID == Intrinsic::amdgcn_writelane) {
- SDValue Src2Cast = DAG.getBitcast(IntVT, Src2);
- Src2 = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32);
+ Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
+ SL, MVT::i32);
}
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
- return DAG.getBitcast(VT, Trunc);
+ return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
+ }
+
+ if (ValSize % 32 != 0)
+ return SDValue();
+
+ if (VT.isVector()) {
+ switch (MVT::SimpleValueType EltTy =
+ VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ case MVT::i32:
+ case MVT::f32: {
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
+ return DAG.UnrollVectorOp(LaneOp.getNode());
+ }
+ case MVT::i16:
+ case MVT::f16:
+ case MVT::bf16: {
+ MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
+ SmallVector<SDValue, 4> Pieces;
+ SDValue Src0SubVec, Src1SubVec, Src2SubVec;
+ for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
+ Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ if (IsPermLane16)
+ Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ if (IntrinsicID == Intrinsic::amdgcn_writelane)
+ Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ Pieces.push_back(
+ IsPermLane16
+ ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
+ : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
+ EltIdx += 2;
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
+ }
+ default:
+ // Handle all other cases by bitcasting to i32 vectors
+ break;
+ }
}
- if ((ValSize % 32) == 0) {
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
Src0 = DAG.getBitcast(VecVT, Src0);
- if (IsPermLane16) {
+ if (IsPermLane16)
Src1 = DAG.getBitcast(VecVT, Src1);
- }
if (IntrinsicID == Intrinsic::amdgcn_writelane)
Src2 = DAG.getBitcast(VecVT, Src2);
@@ -6165,9 +6213,6 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
return DAG.getBitcast(VT, UnrolledLaneOp);
- }
-
- return SDValue();
}
void SITargetLowering::ReplaceNodeResults(SDNode *N,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 732489f22c36f..ed0da0d2a61a2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -566,64 +566,6 @@ define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
ret void
}
-define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) {
-; CHECK-SDAG-LABEL: test_readfirstlane_p0:
-; CHECK-SDAG: ; %bb.0:
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:5]
-; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readfirstlane_p0:
-; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:5]
-; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
- %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src)
- call void asm sideeffect "; use $0", "s"(ptr %x)
- ret void
-}
-
-define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
-; CHECK-SDAG-LABEL: test_readfirstlane_v3p0:
-; CHECK-SDAG: ; %bb.0:
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:9]
-; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readfirstlane_v3p0:
-; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:9]
-; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
- %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src)
- call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
- ret void
-}
-
define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v8i16:
; CHECK-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
index 588f239606f52..3882a5f0f9f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
@@ -1,6 +1,40 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src)
+ call void asm sideeffect "; use $0", "s"(ptr %x)
+ ret void
+}
+
+define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
+ ret void
+}
+
define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_p3:
; CHECK-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 71cd3db81addd..325a39abb588a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -862,72 +862,6 @@ define void @test_readlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src
ret void
}
-define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
-; CHECK-SDAG-LABEL: test_readlane_p0:
-; CHECK-SDAG: ; %bb.0:
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
-; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:5]
-; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readlane_p0:
-; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v4
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s5
-; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s5
-; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:5]
-; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
- %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1)
- call void asm sideeffect "; use $0", "s"(ptr %x)
- ret void
-}
-
-define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
-; CHECK-SDAG-LABEL: test_readlane_v3p0:
-; CHECK-SDAG: ; %bb.0:
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
-; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:9]
-; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readlane_v3p0:
-; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v8
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s9
-; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s9
-; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s9
-; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s9
-; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s9
-; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s9
-; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:9]
-; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
- %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1)
- call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
- ret void
-}
-
define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
; CHECK-SDAG-LABEL: test_readlane_v8i16:
; CHECK-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
index 1b4ee84c75250..49f8ef391c230 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
@@ -1,6 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
+define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(ptr %x)
+ ret void
+}
+
+define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p0:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; CHECK-SDAG-NEXT: s_nop 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1)
+ call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
+ ret void
+}
+
define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
; CHECK-SDAG-LABEL: test_readlane_p3:
; CHECK-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index d0a865f565eeb..31f1085dd76ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -2651,283 +2651,6 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr
ret void
}
-define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
-; GFX802-SDAG-LABEL: test_writelane_p0:
-; GFX802-SDAG: ; %bb.0:
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1]
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT: s_nop 0
-; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0
-; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1010-SDAG-LABEL: test_writelane_p0:
-; GFX1010-SDAG: ; %bb.0:
-; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-SDAG-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v4
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s4, s5
-; GFX1010-SDAG-NEXT: v_writelane_b32 v5, s6, s5
-; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
-; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: test_writelane_p0:
-; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT: global_load_b64 v[5:6], v[0:1], off
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v3
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1
-; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1
-; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off
-; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX802-GISEL-LABEL: test_writelane_p0:
-; GFX802-GISEL: ; %bb.0:
-; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-GISEL-NEXT: flat_load_dwordx2 v[5:6], v[0:1]
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v4
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
-; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT: v_writelane_b32 v5, s4, m0
-; GFX802-GISEL-NEXT: v_writelane_b32 v6, s6, m0
-; GFX802-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
-; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1010-GISEL-LABEL: test_writelane_p0:
-; GFX1010-GISEL: ; %bb.0:
-; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-GISEL-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v4
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1010-GISEL-NEXT: v_writelane_b32 v5, s4, s5
-; GFX1010-GISEL-NEXT: v_writelane_b32 v6, s6, s5
-; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
-; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: test_writelane_p0:
-; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT: global_load_b64 v[5:6], v[0:1], off
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v4
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_writelane_b32 v5, s0, s1
-; GFX1100-GISEL-NEXT: v_writelane_b32 v6, s2, s1
-; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[5:6], off
-; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
- %oldval = load ptr, ptr addrspace(1) %out
- %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval)
- store ptr %writelane, ptr addrspace(1) %out, align 4
- ret void
-}
-
-define void @test_writelane_v3p0(ptr addrspace(1) %out, <4 x ptr> %src, i32 %src1) {
-; GFX802-SDAG-LABEL: test_writelane_v3p0:
-; GFX802-SDAG: ; %bb.0:
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-SDAG-NEXT: v_add_u32_e32 v19, vcc, 16, v0
-; GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[0:1]
-; GFX802-SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc
-; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[19:20]
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v10
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v5
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v4
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v3
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v2
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX802-SDAG-NEXT: v_writelane_b32 v14, s8, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v13, s9, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v12, s10, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v11, s11, m0
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT: v_writelane_b32 v18, s4, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v17, s5, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v16, s6, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v15, s7, m0
-; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
-; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1010-SDAG-LABEL: test_writelane_v3p0:
-; GFX1010-SDAG: ; %bb.0:
-; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-SDAG-NEXT: s_clause 0x1
-; GFX1010-SDAG-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16
-; GFX1010-SDAG-NEXT: global_load_dwordx4 v[15:18], v[0:1], off
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v10
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5
-; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1010-SDAG-NEXT: v_writelane_b32 v18, s9, s5
-; GFX1010-SDAG-NEXT: v_writelane_b32 v17, s10, s5
-; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s11, s5
-; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s12, s5
-; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5
-; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5
-; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5
-; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
-; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[11:14], off offset:16
-; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: test_writelane_v3p0:
-; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT: s_clause 0x1
-; GFX1100-SDAG-NEXT: global_load_b128 v[11:14], v[0:1], off offset:16
-; GFX1100-SDAG-NEXT: global_load_b128 v[15:18], v[0:1], off
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v10
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7
-; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: v_writelane_b32 v18, s5, s1
-; GFX1100-SDAG-NEXT: v_writelane_b32 v17, s6, s1
-; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s7, s1
-; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s8, s1
-; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1
-; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1
-; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1
-; GFX1100-SDAG-NEXT: s_clause 0x1
-; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[15:18], off
-; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[11:14], off offset:16
-; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX802-GISEL-LABEL: test_writelane_v3p0:
-; GFX802-GISEL: ; %bb.0:
-; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-GISEL-NEXT: v_add_u32_e32 v19, vcc, 16, v0
-; GFX802-GISEL-NEXT: flat_load_dwordx4 v[11:14], v[0:1]
-; GFX802-GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc
-; GFX802-GISEL-NEXT: flat_load_dwordx4 v[15:18], v[19:20]
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v10
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5
-; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8
-; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v9
-; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX802-GISEL-NEXT: v_writelane_b32 v11, s4, m0
-; GFX802-GISEL-NEXT: v_writelane_b32 v12, s6, m0
-; GFX802-GISEL-NEXT: v_writelane_b32 v13, s7, m0
-; GFX802-GISEL-NEXT: v_writelane_b32 v14, s8, m0
-; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT: v_writelane_b32 v15, s9, m0
-; GFX802-GISEL-NEXT: v_writelane_b32 v16, s10, m0
-; GFX802-GISEL-NEXT: v_writelane_b32 v17, s11, m0
-; GFX802-GISEL-NEXT: v_writelane_b32 v18, s12, m0
-; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
-; GFX802-GISEL-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
-; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1010-GISEL-LABEL: test_writelane_v3p0:
-; GFX1010-GISEL: ; %bb.0:
-; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-GISEL-NEXT: s_clause 0x1
-; GFX1010-GISEL-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
-; GFX1010-GISEL-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:16
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v10
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8
-; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s12, v9
-; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s4, s5
-; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s6, s5
-; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5
-; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5
-; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1010-GISEL-NEXT: v_writelane_b32 v15, s9, s5
-; GFX1010-GISEL-NEXT: v_writelane_b32 v16, s10, s5
-; GFX1010-GISEL-NEXT: v_writelane_b32 v17, s11, s5
-; GFX1010-GISEL-NEXT: v_writelane_b32 v18, s12, s5
-; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
-; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off offset:16
-; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: test_writelane_v3p0:
-; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT: s_clause 0x1
-; GFX1100-GISEL-NEXT: global_load_b128 v[11:14], v[0:1], off
-; GFX1100-GISEL-NEXT: global_load_b128 v[15:18], v[0:1], off offset:16
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v10
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8
-; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s8, v9
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s0, s1
-; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s2, s1
-; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1
-; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT: v_writelane_b32 v15, s5, s1
-; GFX1100-GISEL-NEXT: v_writelane_b32 v16, s6, s1
-; GFX1100-GISEL-NEXT: v_writelane_b32 v17, s7, s1
-; GFX1100-GISEL-NEXT: v_writelane_b32 v18, s8, s1
-; GFX1100-GISEL-NEXT: s_clause 0x1
-; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[11:14], off
-; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[15:18], off offset:16
-; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
- %oldval = load <4 x ptr>, ptr addrspace(1) %out
- %writelane = call <4 x ptr> @llvm.amdgcn.writelane.v3p0(<4 x ptr> %src, i32 %src1, <4 x ptr> %oldval)
- store <4 x ptr> %writelane, ptr addrspace(1) %out, align 4
- ret void
-}
-
define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
; GFX802-SDAG-LABEL: test_writelane_v8i16:
; GFX802-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
index afc394627d356..c45c73dd88150 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
@@ -3,6 +3,150 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
+define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p0:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p0:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v2
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s4, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v5, s6, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p0:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_b64 v[5:6], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1
+; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load ptr, ptr addrspace(1) %out
+ %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval)
+ store ptr %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_writelane_v3p0(ptr addrspace(1) %out, <4 x ptr> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p0:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: v_add_u32_e32 v19, vcc, 16, v0
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[0:1]
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[19:20]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v10
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v5
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v4
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v2
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v14, s8, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v13, s9, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v12, s10, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v11, s11, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v18, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v17, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v15, s7, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p0:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: s_clause 0x1
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT: global_load_dwordx4 v[15:18], v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v10
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v18, s9, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v17, s10, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s11, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s12, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5
+; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
+; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[11:14], off offset:16
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p0:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_load_b128 v[11:14], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT: global_load_b128 v[15:18], v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v18, s5, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v17, s6, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s7, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s8, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1
+; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1
+; GFX1100-SDAG-NEXT: s_clause 0x1
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[15:18], off
+; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[11:14], off offset:16
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load <4 x ptr>, ptr addrspace(1) %out
+ %writelane = call <4 x ptr> @llvm.amdgcn.writelane.v3p0(<4 x ptr> %src, i32 %src1, <4 x ptr> %oldval)
+ store <4 x ptr> %writelane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
; GFX802-SDAG-LABEL: test_writelane_p3:
; GFX802-SDAG: ; %bb.0:
>From 12155f59f4b08de99b05d8744857c68e8adfce85 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Fri, 31 May 2024 09:38:25 +0000
Subject: [PATCH 07/11] add helper to emit N-ary builtins
---
clang/lib/CodeGen/CGBuiltin.cpp | 31 ++++++++++++++++---------------
1 file changed, 16 insertions(+), 15 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d9ab0051f0d23..a21aa85821375 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -614,6 +614,17 @@ static Value *emitTernaryBuiltin(CodeGenFunction &CGF,
return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
}
+// Emit an intrinsic with N operands, overloaded on the type of its first operand.
+static Value *emitNaryBuiltin(CodeGenFunction &CGF, const CallExpr *E,
+ unsigned IntrinsicID) {
+ SmallVector<Value *, 16> Args;
+ for (unsigned i = 0, e = E->getNumArgs(); i != e; i++)
+ Args.push_back(CGF.EmitScalarExpr(E->getArg(i)));
+
+ Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Args[0]->getType());
+ return CGF.Builder.CreateCall(F, Args);
+}
+
// Emit an intrinsic that has 1 float or double operand, and 1 integer.
static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
const CallExpr *E,
@@ -18480,21 +18491,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
return Builder.CreateCall(F, Args);
}
case AMDGPU::BI__builtin_amdgcn_permlane16:
- case AMDGPU::BI__builtin_amdgcn_permlanex16: {
- llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
- llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
- llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
- llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
- llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
- llvm::Value *Src5 = EmitScalarExpr(E->getArg(5));
-
- Intrinsic::ID IID = BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
- ? Intrinsic::amdgcn_permlane16
- : Intrinsic::amdgcn_permlanex16;
-
- llvm::Function *F = CGM.getIntrinsic(IID, Src0->getType());
- return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5});
- }
+ case AMDGPU::BI__builtin_amdgcn_permlanex16:
+ return emitNaryBuiltin(*this, E,
+ BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
+ ? Intrinsic::amdgcn_permlane16
+ : Intrinsic::amdgcn_permlanex16);
case AMDGPU::BI__builtin_amdgcn_permlane64:
return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_permlane64);
case AMDGPU::BI__builtin_amdgcn_readlane:
>From 40381ca63182f7986a1bf723128cfdfa6b387557 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 17 Jun 2024 12:09:20 +0000
Subject: [PATCH 08/11] update with latest changes from #89217
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 6 -
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 7 --
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 54 --------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 115 +++++++++++++-----
llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +-
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 +-
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 4 +-
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8 +-
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 5 +-
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 7 +-
10 files changed, 96 insertions(+), 116 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 47347ada935be..18193d8807597 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5508,12 +5508,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LDS)
NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
- NODE_NAME_CASE(READLANE)
- NODE_NAME_CASE(READFIRSTLANE)
- NODE_NAME_CASE(WRITELANE)
- NODE_NAME_CASE(PERMLANE16)
- NODE_NAME_CASE(PERMLANEX16)
- NODE_NAME_CASE(PERMLANE64)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(LOAD_D16_HI)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 293fa6259fef7..71c4334029b43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -558,13 +558,6 @@ enum NodeType : unsigned {
FPTRUNC_ROUND_UPWARD,
FPTRUNC_ROUND_DOWNWARD,
- READLANE,
- READFIRSTLANE,
- WRITELANE,
- PERMLANE16,
- PERMLANEX16,
- PERMLANE64,
-
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
LOAD_D16_HI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index a82f48950f493..702f6e67c5527 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -342,34 +342,6 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
-def AMDGPUReadfirstlaneOp : SDTypeProfile<1, 1, [
- SDTCisSameAs<0, 1>
-]>;
-
-def AMDGPUReadlaneOp : SDTypeProfile<1, 2, [
- SDTCisSameAs<0, 1>, SDTCisInt<2>
-]>;
-
-def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [
- SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>
-]>;
-
-def AMDGPUDPermlane16Op : SDTypeProfile<1, 6, [
- SDTCisSameAs<0, 1>, // old
- SDTCisSameAs<0, 2>, // src0
- SDTCisInt<3>, // src1
- SDTCisInt<4>, // src2
- SDTCisInt<5>, // i1 fi
- SDTCisInt<6> // i1 bound_ctrl
-]>;
-
-def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>;
-def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>;
-def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp>;
-def AMDGPUpermlane16_impl : SDNode<"AMDGPUISD::PERMLANE16", AMDGPUDPermlane16Op>;
-def AMDGPUpermlanex16_impl : SDNode<"AMDGPUISD::PERMLANEX16", AMDGPUDPermlane16Op>;
-def AMDGPUpermlane64_impl : SDNode<"AMDGPUISD::PERMLANE64", AMDGPUReadfirstlaneOp>;
-
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
@@ -534,29 +506,3 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc
def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
(AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;
-
-def AMDGPUreadlane : PatFrags<(ops node:$src0, node:$src1),
- [(int_amdgcn_readlane node:$src0, node:$src1),
- (AMDGPUreadlane_impl node:$src0, node:$src1)]>;
-
-def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
- [(int_amdgcn_readfirstlane node:$src),
- (AMDGPUreadfirstlane_impl node:$src)]>;
-
-def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
- [(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
- (AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
-
-def AMDGPUpermlane16 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
- [(int_amdgcn_permlane16 node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
- (AMDGPUpermlane16_impl node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5)]>;
-
-def AMDGPUpermlanex16 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
- [(int_amdgcn_permlanex16 node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
- (AMDGPUpermlanex16_impl node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5)]>;
-
-def AMDGPUpermlane64 : PatFrags<(ops node:$src),
- [(int_amdgcn_permlane64 node:$src),
- (AMDGPUpermlane64_impl node:$src)]>;
-
-
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d91ab31832070..f55025ec5e08d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6095,46 +6095,55 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
unsigned ValSize = VT.getSizeInBits();
- unsigned IntrinsicID = N->getConstantOperandVal(0);
- bool IsPermLane16 = IntrinsicID == Intrinsic::amdgcn_permlane16 ||
- IntrinsicID == Intrinsic::amdgcn_permlanex16;
- SDValue Src0 = N->getOperand(1);
+ unsigned IID = N->getConstantOperandVal(0);
+ bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
+ IID == Intrinsic::amdgcn_permlanex16;
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);
- auto createLaneOp = [&DAG, &SL, N](SDValue Src0, SDValue Src1, SDValue Src2,
- MVT ValueT) -> SDValue {
- switch (unsigned IID = N->getConstantOperandVal(0)) {
+ auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, SDValue Src2,
+ MVT ValT) -> SDValue {
+ SmallVector<SDValue, 8> Operands;
+ switch (IID) {
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ Operands.push_back(N->getOperand(6));
+ Operands.push_back(N->getOperand(5));
+ Operands.push_back(N->getOperand(4));
+ [[fallthrough]];
+ case Intrinsic::amdgcn_writelane:
+ Operands.push_back(Src2);
+ [[fallthrough]];
+ case Intrinsic::amdgcn_readlane:
+ Operands.push_back(Src1);
+ [[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane64:
- return DAG.getNode(IID == Intrinsic::amdgcn_readfirstlane
- ? AMDGPUISD::READFIRSTLANE
- : AMDGPUISD::PERMLANE64,
- SL, ValueT, {Src0});
- case Intrinsic::amdgcn_readlane:
- return DAG.getNode(AMDGPUISD::READLANE, SL, ValueT, {Src0, Src1});
- case Intrinsic::amdgcn_writelane:
- return DAG.getNode(AMDGPUISD::WRITELANE, SL, ValueT, {Src0, Src1, Src2});
- case Intrinsic::amdgcn_permlane16:
- case Intrinsic::amdgcn_permlanex16: {
- SDValue Src3 = N->getOperand(4);
- SDValue Src4 = N->getOperand(5);
- SDValue Src5 = N->getOperand(6);
- return DAG.getNode(IID == Intrinsic::amdgcn_permlane16
- ? AMDGPUISD::PERMLANE16
- : AMDGPUISD::PERMLANEX16,
- SL, ValueT, {Src0, Src1, Src2, Src3, Src4, Src5});
- }
+ Operands.push_back(Src0);
+ break;
default:
llvm_unreachable("unhandled lane op");
}
+
+ Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
+ std::reverse(Operands.begin(), Operands.end());
+
+ if (SDNode *GL = N->getGluedNode()) {
+ assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+ GL = GL->getOperand(0).getNode();
+ Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
+ SDValue(GL, 0)));
+ }
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
};
+ SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
- if (IntrinsicID == Intrinsic::amdgcn_readlane ||
- IntrinsicID == Intrinsic::amdgcn_writelane || IsPermLane16) {
+ if (IID == Intrinsic::amdgcn_readlane ||
+ IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src1 = N->getOperand(2);
- if (IntrinsicID == Intrinsic::amdgcn_writelane || IsPermLane16)
+ if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
}
@@ -6153,7 +6162,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SL, MVT::i32);
}
- if (IntrinsicID == Intrinsic::amdgcn_writelane) {
+ if (IID == Intrinsic::amdgcn_writelane) {
Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
SL, MVT::i32);
}
@@ -6165,6 +6174,46 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
if (ValSize % 32 != 0)
return SDValue();
+
+ auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
+ EVT VT = N->getValueType(0);
+ unsigned NE = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ SmallVector<SDValue, 8> Scalars;
+ unsigned NumOperands = N->getNumOperands();
+ SmallVector<SDValue, 4> Operands(NumOperands);
+ SDNode *GL = N->getGluedNode();
+
+ // only handle convergencectrl_glue
+ assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+
+ for (unsigned i = 0; i != NE; ++i) {
+ for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
+ ++j) {
+ SDValue Operand = N->getOperand(j);
+ EVT OperandVT = Operand.getValueType();
+ if (OperandVT.isVector()) {
+ // A vector operand; extract a single element.
+ EVT OperandEltVT = OperandVT.getVectorElementType();
+ Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
+ Operand, DAG.getVectorIdxConstant(i, SL));
+ } else {
+ // A scalar operand; just use it as is.
+ Operands[j] = Operand;
+ }
+ }
+
+ if (GL)
+ Operands[NumOperands - 1] =
+ DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
+ SDValue(GL->getOperand(0).getNode(), 0));
+
+ Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
+ }
+
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
+ return DAG.getBuildVector(VecVT, SL, Scalars);
+ };
if (VT.isVector()) {
switch (MVT::SimpleValueType EltTy =
@@ -6172,7 +6221,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
case MVT::i32:
case MVT::f32: {
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
- return DAG.UnrollVectorOp(LaneOp.getNode());
+ return unrollLaneOp(LaneOp.getNode());
}
case MVT::i16:
case MVT::f16:
@@ -6188,7 +6237,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
DAG.getConstant(EltIdx, SL, MVT::i32));
- if (IntrinsicID == Intrinsic::amdgcn_writelane)
+ if (IID == Intrinsic::amdgcn_writelane)
Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
DAG.getConstant(EltIdx, SL, MVT::i32));
@@ -6212,11 +6261,11 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
if (IsPermLane16)
Src1 = DAG.getBitcast(VecVT, Src1);
- if (IntrinsicID == Intrinsic::amdgcn_writelane)
+ if (IID == Intrinsic::amdgcn_writelane)
Src2 = DAG.getBitcast(VecVT, Src2);
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
- SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
+ SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
return DAG.getBitcast(VT, UnrolledLaneOp);
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 5941b264fdb21..a5e59fde107ea 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3389,7 +3389,7 @@ def : GCNPat<
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
- (i32 (AMDGPUreadfirstlane (i32 imm:$src))),
+ (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 71c83e072b4b0..2c0d61ee4afa1 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -255,7 +255,7 @@ def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
}
foreach vt = Reg32Types.types in {
- def : GCNPat<(vt (AMDGPUreadfirstlane (vt VRegOrLdsSrc_32:$src0))),
+ def : GCNPat<(vt (int_amdgcn_readfirstlane (vt VRegOrLdsSrc_32:$src0))),
(V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0))
>;
}
@@ -743,7 +743,7 @@ let SubtargetPredicate = isGFX11Plus in {
} // End SubtargetPredicate = isGFX11Plus
foreach vt = Reg32Types.types in {
- def : GCNPat<(AMDGPUpermlane64 (vt VRegSrc_32:$src0)),
+ def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)),
(vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
>;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 40106314cb0e6..ae4030e509e9c 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -789,11 +789,11 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> {
} // End isConvergent = 1
foreach vt = Reg32Types.types in {
- def : GCNPat<(vt (AMDGPUreadlane vt:$src0, i32:$src1)),
+ def : GCNPat<(vt (int_amdgcn_readlane vt:$src0, i32:$src1)),
(V_READLANE_B32 VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1)
>;
- def : GCNPat<(vt (AMDGPUwritelane vt:$src0, i32:$src1, vt:$src2)),
+ def : GCNPat<(vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)),
(V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2)
>;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index ff098f8fe50f3..6ba54cca44958 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -841,8 +841,8 @@ def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">,
class PermlanePat<SDPatternOperator permlane,
Instruction inst, ValueType vt> : GCNPat<
- (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2,
- timm:$fi, timm:$bc),
+ (vt (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2,
+ timm:$fi, timm:$bc)),
(inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc),
SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
>;
@@ -867,8 +867,8 @@ let SubtargetPredicate = isGFX10Plus in {
} // End $vdst = $vdst_in, DisableEncoding $vdst_in, IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1
foreach vt = Reg32Types.types in {
- def : PermlanePat<AMDGPUpermlane16, V_PERMLANE16_B32_e64, vt>;
- def : PermlanePat<AMDGPUpermlanex16, V_PERMLANEX16_B32_e64, vt>;
+ def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
+ def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
}
defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index ed0da0d2a61a2..cc6c630ae6466 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -463,10 +463,9 @@ define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) {
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0xffff
-; CHECK-SDAG-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 0xffff
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use v0
+; CHECK-SDAG-NEXT: ; use s4
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 325a39abb588a..66e1f9396de5a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -743,12 +743,11 @@ define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0xffff
-; CHECK-SDAG-NEXT: s_nop 2
+; CHECK-SDAG-NEXT: s_nop 3
; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
-; CHECK-SDAG-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 0xffff
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use v0
+; CHECK-SDAG-NEXT: ; use s4
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
>From 4e4cdd961e3daea74e2ae5689fea73c0b6685f57 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 17 Jun 2024 08:16:39 -0400
Subject: [PATCH 09/11] clang format
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f55025ec5e08d..1101de206a8fe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6101,8 +6101,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);
- auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, SDValue Src2,
- MVT ValT) -> SDValue {
+ auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
+ SDValue Src2, MVT ValT) -> SDValue {
SmallVector<SDValue, 8> Operands;
switch (IID) {
case Intrinsic::amdgcn_permlane16:
@@ -6140,8 +6140,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
- if (IID == Intrinsic::amdgcn_readlane ||
- IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
+ IsPermLane16) {
Src1 = N->getOperand(2);
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
@@ -6174,7 +6174,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
if (ValSize % 32 != 0)
return SDValue();
-
+
auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
EVT VT = N->getValueType(0);
unsigned NE = VT.getVectorNumElements();
>From 2091436a012063a7ae539c3cbe9b0c5b14b9bb01 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 5 Jun 2024 05:45:36 +0000
Subject: [PATCH 10/11] [AMDGPU] Enable atomic optimizer for 64 bit values
---
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 104 +-
.../global-atomic-fadd.f32-no-rtn.ll | 23 +-
.../GlobalISel/global-atomic-fadd.f32-rtn.ll | 39 +-
.../GlobalISel/global-atomic-fadd.f64.ll | 1346 +++++-
.../atomic_optimizations_global_pointer.ll | 1150 ++++-
.../atomic_optimizations_local_pointer.ll | 798 +++-
.../AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 33 +-
.../AMDGPU/global-atomic-fadd.f32-rtn.ll | 29 +-
.../CodeGen/AMDGPU/global-atomic-fadd.f64.ll | 1332 +++++-
.../AMDGPU/global_atomic_optimizer_fp_rtn.ll | 1672 ++++---
.../global_atomics_iterative_scan_fp.ll | 146 +-
.../global_atomics_optimizer_fp_no_rtn.ll | 906 ++--
.../AMDGPU/global_atomics_scan_fadd.ll | 4190 +++++++++++-----
.../AMDGPU/global_atomics_scan_fmax.ll | 2569 +++++++---
.../AMDGPU/global_atomics_scan_fmin.ll | 2569 +++++++---
.../AMDGPU/global_atomics_scan_fsub.ll | 4194 ++++++++++++-----
16 files changed, 15654 insertions(+), 5446 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index d7ef6f3c5dc43..f204522293e6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -228,10 +228,11 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
- // we have DPP available on our subtarget, and the atomic operation is 32
- // bits.
+ // we have DPP available on our subtarget, and the atomic operation is either
+ // 32 or 64 bits.
if (ValDivergent &&
- (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
+ (!ST->hasDPP() || (DL->getTypeSizeInBits(I.getType()) != 32 &&
+ DL->getTypeSizeInBits(I.getType()) != 64))) {
return;
}
@@ -311,10 +312,11 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
- // we have DPP available on our subtarget, and the atomic operation is 32
- // bits.
+ // we have DPP available on our subtarget, and the atomic operation is 32 or
+ // 64 bits.
if (ValDivergent &&
- (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
+ (!ST->hasDPP() || (DL->getTypeSizeInBits(I.getType()) != 32 &&
+ DL->getTypeSizeInBits(I.getType()) != 64))) {
return;
}
@@ -386,7 +388,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
Value *V,
Value *const Identity) const {
Type *AtomicTy = V->getType();
- Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -402,34 +403,30 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Reduce within each pair of rows (i.e. 32 lanes).
assert(ST->hasPermLaneX16());
- V = B.CreateBitCast(V, IntNTy);
Value *Permlanex16Call = B.CreateIntrinsic(
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
- V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
- B.CreateBitCast(Permlanex16Call, AtomicTy));
+ V = buildNonAtomicBinOp(B, Op, V,
+ Permlanex16Call);
if (ST->isWave32()) {
return V;
}
if (ST->hasPermLane64()) {
// Reduce across the upper and lower 32 lanes.
- V = B.CreateBitCast(V, IntNTy);
Value *Permlane64Call =
B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
- return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
- B.CreateBitCast(Permlane64Call, AtomicTy));
+ return buildNonAtomicBinOp(B, Op, V,
+ Permlane64Call);
}
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
- V = B.CreateBitCast(V, IntNTy);
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
- return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
- B.CreateBitCast(Lane32, AtomicTy));
+ return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}
// Use the builder to create an inclusive scan of V across the wavefront, with
@@ -438,7 +435,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
AtomicRMWInst::BinOp Op, Value *V,
Value *Identity) const {
Type *AtomicTy = V->getType();
- Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
@@ -470,20 +466,18 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
assert(ST->hasPermLaneX16());
- V = B.CreateBitCast(V, IntNTy);
Value *PermX = B.CreateIntrinsic(
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
Value *UpdateDPPCall =
- B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
+ B.CreateCall(UpdateDPP, {Identity, PermX,
B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
B.getInt32(0xf), B.getFalse()});
- V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+ V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
if (!ST->isWave32()) {
// Combine lane 31 into lanes 32..63.
- V = B.CreateBitCast(V, IntNTy);
Value *const Lane31 = B.CreateIntrinsic(
V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
@@ -491,7 +485,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
- V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
+ V = buildNonAtomicBinOp(B, Op, V,
UpdateDPPCall);
}
}
@@ -503,8 +497,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
Value *Identity) const {
Type *AtomicTy = V->getType();
- Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -515,9 +507,9 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
B.getInt32(0xf), B.getFalse()});
} else {
Function *ReadLane = Intrinsic::getDeclaration(
- M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+ M, Intrinsic::amdgcn_readlane, AtomicTy);
Function *WriteLane = Intrinsic::getDeclaration(
- M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+ M, Intrinsic::amdgcn_writelane, AtomicTy);
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
@@ -529,22 +521,18 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
// Copy the old lane 15 to the new lane 16.
V = B.CreateCall(
WriteLane,
- {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
- B.getInt32(16), B.CreateBitCast(V, IntNTy)});
- V = B.CreateBitCast(V, AtomicTy);
+ {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+ B.getInt32(16), V});
if (!ST->isWave32()) {
// Copy the old lane 31 to the new lane 32.
- V = B.CreateBitCast(V, IntNTy);
V = B.CreateCall(WriteLane,
- {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
- B.getInt32(31)}),
+ {B.CreateCall(ReadLane, {Old, B.getInt32(31)}),
B.getInt32(32), V});
// Copy the old lane 47 to the new lane 48.
V = B.CreateCall(
WriteLane,
{B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
- V = B.CreateBitCast(V, AtomicTy);
}
}
@@ -584,24 +572,20 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
auto *FF1 =
B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
- Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
- auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+ auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
// Get the value required for atomic operation
- V = B.CreateBitCast(V, IntNTy);
Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
{V, LaneIdxInt});
- LaneValue = B.CreateBitCast(LaneValue, Ty);
// Perform writelane if intermediate scan results are required later in the
// kernel computations
Value *OldValue = nullptr;
if (NeedResult) {
OldValue =
- B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
- {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
- B.CreateBitCast(OldValuePhi, IntNTy)});
- OldValue = B.CreateBitCast(OldValue, Ty);
+ B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+ {Accumulator, LaneIdxInt,
+ OldValuePhi});
OldValuePhi->addIncoming(OldValue, ComputeLoop);
}
@@ -700,10 +684,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Type *const Ty = I.getType();
Type *Int32Ty = B.getInt32Ty();
- Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
- auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
@@ -758,13 +740,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
if (ScanImpl == ScanOptions::DPP) {
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
- V = B.CreateBitCast(V, IntNTy);
- Identity = B.CreateBitCast(Identity, IntNTy);
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty,
{V, Identity});
- NewV = B.CreateBitCast(NewV, Ty);
- V = B.CreateBitCast(V, Ty);
- Identity = B.CreateBitCast(Identity, Ty);
if (!NeedResult && ST->hasPermLaneX16()) {
// On GFX10 the permlanex16 instruction helps us build a reduction
// without too many readlanes and writelanes, which are generally bad
@@ -778,11 +755,9 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// of each active lane in the wavefront. This will be our new value
// which we will provide to the atomic operation.
Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
- assert(TyBitWidth == 32);
- NewV = B.CreateBitCast(NewV, IntNTy);
- NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+ assert(TyBitWidth == 32 || TyBitWidth == 64);
+ NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
{NewV, LastLaneIdx});
- NewV = B.CreateBitCast(NewV, Ty);
}
// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -922,26 +897,9 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// but have to handle 64-bit broadcasts with two calls to this intrinsic.
Value *BroadcastI = nullptr;
- if (TyBitWidth == 64) {
- Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
- Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
- CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
- Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
- CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
- Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
- Value *const PartialInsert = B.CreateInsertElement(
- PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
- Value *const Insert =
- B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
- BroadcastI = B.CreateBitCast(Insert, Ty);
- } else if (TyBitWidth == 32) {
- Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
+ if (TyBitWidth == 32 || TyBitWidth == 64) {
BroadcastI =
- B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
- BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
+ B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
} else {
llvm_unreachable("Unhandled atomic bit width");
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 21832dc320e42..3f0b86c271538 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -169,30 +169,29 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec
; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+ ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
- ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 63
- ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_4]]
+ ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]]
; GFX90A_GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
; GFX90A_GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
@@ -200,7 +199,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.3
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35):
+ ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.31):
; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -211,7 +210,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37):
+ ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.33):
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index e48d281f37c9a..676eae1bad85d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -171,35 +171,34 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+ ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
- ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY9]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_3]], 0, [[S_MOV_B32_3]], [[V_ADD_F32_e64_3]], 0, implicit $exec
- ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec
+ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 15
- ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_4]]
- ; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READLANE_B32_]], [[S_MOV_B32_5]], [[V_MOV_B32_dpp5]]
- ; GFX11-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 31
- ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_6]]
+ ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 15
+ ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_3]]
+ ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READLANE_B32_]], [[S_MOV_B32_4]], [[V_MOV_B32_dpp5]]
+ ; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31
+ ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_5]]
; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]]
; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY13]], implicit $exec
; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
@@ -207,7 +206,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.3 (%ir-block.36):
+ ; GFX11-NEXT: bb.3 (%ir-block.29):
; GFX11-NEXT: successors: %bb.5(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -217,11 +216,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: bb.4.Flow:
; GFX11-NEXT: successors: %bb.6(0x80000000)
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1
+ ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %40, %bb.5, [[DEF]], %bb.1
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.6
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.5 (%ir-block.39):
+ ; GFX11-NEXT: bb.5 (%ir-block.32):
; GFX11-NEXT: successors: %bb.4(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2
@@ -232,7 +231,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.4
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.6 (%ir-block.47):
+ ; GFX11-NEXT: bb.6 (%ir-block.37):
; GFX11-NEXT: $vgpr0 = COPY [[PHI]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
index b058ad1023e13..8ad91f001bd72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
@@ -1,249 +1,1219 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX90A,GFX90A_ITERATIVE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX90A,GFX90A_DPP %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX940,GFX940_ITERATIVE %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX940,GFX940_DPP %s
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX90A_ITERATIVE: bb.1 (%ir-block.0):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.5):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.7):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], %25, [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.4.Flow:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF %35, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.5 (%ir-block.9):
+ ; GFX90A_ITERATIVE-NEXT: S_ENDPGM 0
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.6.Flow1:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeLoop:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %17, %bb.7, [[S_MOV_B]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI %22, %bb.7, [[COPY4]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY5]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY6]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_1]]
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY9]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY10]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
+ ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.8
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.8.ComputeEnd:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.7
+ ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE3]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY14]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY18]], [[COPY19]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY17]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY20]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY21]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX90A_DPP: bb.1 (%ir-block.0):
+ ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
+ ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.2 (%ir-block.5):
+ ; GFX90A_DPP-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY19]], [[S_MOV_B32_2]]
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]]
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY21]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY22]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.3 (%ir-block.31):
+ ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.4.Flow:
+ ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.5 (%ir-block.33):
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX940_ITERATIVE: bb.1 (%ir-block.0):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.5):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.7):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], %24, [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.4.Flow:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF %34, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.5 (%ir-block.9):
+ ; GFX940_ITERATIVE-NEXT: S_ENDPGM 0
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.6.Flow1:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.7.ComputeLoop:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %16, %bb.7, [[S_MOV_B]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI %21, %bb.7, [[COPY4]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY5]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY6]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_1]]
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+ ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY9]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY10]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
+ ; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.8
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.8.ComputeEnd:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.7
+ ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE3]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY14]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY18]], [[COPY19]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY17]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY20]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY21]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX940_DPP: bb.1 (%ir-block.0):
+ ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
+ ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.2 (%ir-block.5):
+ ; GFX940_DPP-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY19]], [[S_MOV_B32_2]]
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]]
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX940_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY21]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY22]], implicit $exec
+ ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.3 (%ir-block.31):
+ ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.4.Flow:
+ ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.5 (%ir-block.33):
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX90A_ITERATIVE: bb.1 (%ir-block.0):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.5):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.7):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], %28, [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.9):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.8
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF %38, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 0, %27, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.5 (%ir-block.13):
+ ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %43.sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %43.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX90A_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.6.Flow:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.4, [[DEF]], %bb.1
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeLoop:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI %19, %bb.7, [[S_MOV_B]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI %18, %bb.7, [[DEF]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:vreg_64_align2 = PHI %24, %bb.7, [[COPY4]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY10]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY11]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_4]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_5]]
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_7]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_6]], $m0, [[COPY14]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_9]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_8]], $m0, [[COPY15]]
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY16]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY17]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY18]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY19]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY20]], [[V_NOT_B32_e32_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY21]], [[V_NOT_B32_e32_1]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE4]], [[COPY22]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
+ ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.8
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.8.ComputeEnd:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE3]], %bb.7
+ ; GFX90A_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.7
+ ; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY25]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE5]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY23]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY27]], [[COPY28]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY26]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY29]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY30]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX90A_DPP: bb.1 (%ir-block.0):
+ ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.2 (%ir-block.5):
+ ; GFX90A_DPP-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY19]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]]
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY21]], [[S_MOV_B32_2]]
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY22]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY23]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.3 (%ir-block.32):
+ ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.5
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.4.Flow:
+ ; GFX90A_DPP-NEXT: successors: %bb.6(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %44, %bb.5, [[DEF]], %bb.1
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.6
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.5 (%ir-block.35):
+ ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.2
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[STRICT_WWM1:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[V_MOV_B6]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]]
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY26]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.4
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.6 (%ir-block.40):
+ ; GFX90A_DPP-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec
+ ; GFX90A_DPP-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec
+ ; GFX90A_DPP-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX90A_DPP-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX940_ITERATIVE: bb.1 (%ir-block.0):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.5):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.7):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], %27, [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.9):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.8
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF %37, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 0, %26, 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.5 (%ir-block.13):
+ ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %42.sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %42.sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX940_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.6.Flow:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.4, [[DEF]], %bb.1
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.7.ComputeLoop:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI %18, %bb.7, [[S_MOV_B]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI %17, %bb.7, [[DEF]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:vreg_64_align2 = PHI %23, %bb.7, [[COPY4]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY10]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY11]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_4]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_5]]
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_7]]
+ ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_6]], $m0, [[COPY14]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_9]]
+ ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_8]], $m0, [[COPY15]]
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]]
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY16]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+ ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY17]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY18]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY19]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY20]], [[V_NOT_B32_e32_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY21]], [[V_NOT_B32_e32_1]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE4]], [[COPY22]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
+ ; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.8
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.8.ComputeEnd:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE3]], %bb.7
+ ; GFX940_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.7
+ ; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY25]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE5]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY23]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY27]], [[COPY28]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY26]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY29]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY30]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX940_DPP: bb.1 (%ir-block.0):
+ ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
+ ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.2 (%ir-block.5):
+ ; GFX940_DPP-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]]
+ ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY19]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]]
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY21]], [[S_MOV_B32_2]]
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX940_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY22]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY23]], implicit $exec
+ ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.3 (%ir-block.32):
+ ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.5
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.4.Flow:
+ ; GFX940_DPP-NEXT: successors: %bb.6(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %43, %bb.5, [[DEF]], %bb.1
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.6
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.5 (%ir-block.35):
+ ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.2
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX940_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[STRICT_WWM1:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[V_MOV_B6]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]]
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY26]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.4
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.6 (%ir-block.40):
+ ; GFX940_DPP-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX940_DPP-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec
+ ; GFX940_DPP-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec
+ ; GFX940_DPP-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]]
+ ; GFX940_DPP-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret double %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 8ee0ee3b27bae..f0cec54691d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX1264 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX1232 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232 %s
declare i32 @llvm.amdgcn.workitem.id.x()
@@ -1058,13 +1058,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -1095,10 +1095,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX89-NEXT: .LBB3_2:
; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_readfirstlane_b32 s2, v0
-; GFX89-NEXT: v_readfirstlane_b32 s3, v1
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
+; GFX89-NEXT: v_readfirstlane_b32 s2, v1
+; GFX89-NEXT: v_readfirstlane_b32 s3, v0
+; GFX89-NEXT: v_mov_b32_e32 v0, s3
+; GFX89-NEXT: v_mov_b32_e32 v1, s2
; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
; GFX89-NEXT: s_mov_b32 s3, 0xf000
; GFX89-NEXT: s_mov_b32 s2, -1
@@ -1134,8 +1134,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
@@ -1169,8 +1169,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
@@ -1205,8 +1205,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: .LBB3_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -1242,8 +1242,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: .LBB3_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
@@ -1281,8 +1281,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: .LBB3_2:
; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
@@ -1318,8 +1318,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: .LBB3_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
@@ -1367,15 +1367,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2
; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2
; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2
; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s3, v2
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
@@ -1407,10 +1407,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1]
@@ -1449,10 +1449,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
@@ -1493,8 +1493,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: .LBB4_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
@@ -1534,8 +1534,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1032-NEXT: .LBB4_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
@@ -1576,8 +1576,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB4_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
@@ -1622,8 +1622,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB4_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
@@ -1666,8 +1666,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB4_2:
; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-NEXT: s_mov_b32 s6, -1
@@ -1706,8 +1706,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB4_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
@@ -1744,87 +1744,440 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX89-LABEL: add_i64_varying:
-; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
-; GFX89-NEXT: v_mov_b32_e32 v1, 0
-; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT: s_endpgm
+; GFX8-LABEL: add_i64_varying:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8-NEXT: s_add_u32 s4, s4, s8
+; GFX8-NEXT: v_writelane_b32 v2, s5, m0
+; GFX8-NEXT: s_addc_u32 s5, s5, s7
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execz .LBB5_4
+; GFX8-NEXT: ; %bb.3:
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s2
+; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: .LBB5_4:
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: add_i64_varying:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9-NEXT: s_add_u32 s4, s4, s8
+; GFX9-NEXT: v_writelane_b32 v2, s5, m0
+; GFX9-NEXT: s_addc_u32 s5, s5, s7
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execz .LBB5_4
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: .LBB5_4:
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: add_i64_varying:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[4:5], 0
+; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1064-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1064-NEXT: s_add_u32 s4, s4, s7
+; GFX1064-NEXT: s_addc_u32 s5, s5, s8
+; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1064-NEXT: s_cbranch_execz .LBB5_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: v_mov_b32_e32 v3, s4
+; GFX1064-NEXT: v_mov_b32_e32 v4, s5
+; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s8, s2
+; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
+; GFX1064-NEXT: .LBB5_4:
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: add_i64_varying:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-NEXT: s_mov_b64 s[4:5], 0
+; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1032-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1032-NEXT: s_add_u32 s4, s4, s6
+; GFX1032-NEXT: s_addc_u32 s5, s5, s7
+; GFX1032-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1032-NEXT: s_cbranch_execz .LBB5_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: v_mov_b32_e32 v3, s4
+; GFX1032-NEXT: v_mov_b32_e32 v4, s5
+; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s8, s2
+; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
+; GFX1032-NEXT: .LBB5_4:
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: add_i64_varying:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[4:5], 0
+; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1164-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1164-NEXT: s_add_u32 s4, s4, s7
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_addc_u32 s5, s5, s8
+; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164-NEXT: s_cbranch_execz .LBB5_4
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: v_mov_b32_e32 v3, s4
+; GFX1164-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s10, -1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s8, s2
+; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
+; GFX1164-NEXT: .LBB5_4:
+; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1164-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT: s_endpgm
;
-; GFX10-LABEL: add_i64_varying:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: add_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: add_i64_varying:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX1132-LABEL: add_i64_varying:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b64 s[4:5], 0
+; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1132-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1132-NEXT: s_add_u32 s4, s4, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_addc_u32 s5, s5, s7
+; GFX1132-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1132-NEXT: s_cbranch_execz .LBB5_4
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s10, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s8, s2
+; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
+; GFX1132-NEXT: .LBB5_4:
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1132-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT: s_endpgm
+;
+; GFX1264-LABEL: add_i64_varying:
+; GFX1264: ; %bb.0: ; %entry
+; GFX1264-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[4:5], 0
+; GFX1264-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1264-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1264-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1264-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1264-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264-NEXT: v_writelane_b32 v2, s5, s10
+; GFX1264-NEXT: v_writelane_b32 v1, s4, s10
+; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1264-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1264-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1264-NEXT: s_cbranch_execz .LBB5_4
+; GFX1264-NEXT: ; %bb.3:
+; GFX1264-NEXT: v_mov_b32_e32 v3, s4
+; GFX1264-NEXT: v_mov_b32_e32 v4, s5
+; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: s_mov_b32 s8, s2
+; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264-NEXT: s_wait_loadcnt 0x0
+; GFX1264-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264-NEXT: .LBB5_4:
+; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1264-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1264-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s2, -1
+; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264-NEXT: s_nop 0
+; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264-NEXT: s_endpgm
+;
+; GFX1232-LABEL: add_i64_varying:
+; GFX1232: ; %bb.0: ; %entry
+; GFX1232-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232-NEXT: s_mov_b64 s[4:5], 0
+; GFX1232-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1232-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1232-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1232-NEXT: s_lshl_b32 s8, 1, s3
+; GFX1232-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1232-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1232-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1232-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1232-NEXT: s_cbranch_execz .LBB5_4
+; GFX1232-NEXT: ; %bb.3:
+; GFX1232-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: s_mov_b32 s8, s2
+; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232-NEXT: s_wait_loadcnt 0x0
+; GFX1232-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232-NEXT: .LBB5_4:
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1232-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1232-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s2, -1
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: s_nop 0
+; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
@@ -2925,13 +3278,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -2961,12 +3314,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB9_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -2999,12 +3352,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB9_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_readfirstlane_b32 s5, v0
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -3291,15 +3644,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2
; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2
; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2
; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s2
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s3, v2
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
@@ -3334,11 +3687,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s1, v2
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
@@ -3379,11 +3732,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
@@ -3689,87 +4042,440 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX89-LABEL: sub_i64_varying:
-; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
-; GFX89-NEXT: v_mov_b32_e32 v1, 0
-; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT: s_endpgm
+; GFX8-LABEL: sub_i64_varying:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8-NEXT: s_add_u32 s4, s4, s8
+; GFX8-NEXT: v_writelane_b32 v2, s5, m0
+; GFX8-NEXT: s_addc_u32 s5, s5, s7
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execz .LBB11_4
+; GFX8-NEXT: ; %bb.3:
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s2
+; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: .LBB11_4:
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: sub_i64_varying:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9-NEXT: s_add_u32 s4, s4, s8
+; GFX9-NEXT: v_writelane_b32 v2, s5, m0
+; GFX9-NEXT: s_addc_u32 s5, s5, s7
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execz .LBB11_4
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: .LBB11_4:
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: sub_i64_varying:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[4:5], 0
+; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1064-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1064-NEXT: s_add_u32 s4, s4, s7
+; GFX1064-NEXT: s_addc_u32 s5, s5, s8
+; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1064-NEXT: s_cbranch_execz .LBB11_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: v_mov_b32_e32 v3, s4
+; GFX1064-NEXT: v_mov_b32_e32 v4, s5
+; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s8, s2
+; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
+; GFX1064-NEXT: .LBB11_4:
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: sub_i64_varying:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-NEXT: s_mov_b64 s[4:5], 0
+; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1032-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1032-NEXT: s_add_u32 s4, s4, s6
+; GFX1032-NEXT: s_addc_u32 s5, s5, s7
+; GFX1032-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1032-NEXT: s_cbranch_execz .LBB11_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: v_mov_b32_e32 v3, s4
+; GFX1032-NEXT: v_mov_b32_e32 v4, s5
+; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s8, s2
+; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
+; GFX1032-NEXT: .LBB11_4:
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: sub_i64_varying:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[4:5], 0
+; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1164-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1164-NEXT: s_add_u32 s4, s4, s7
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_addc_u32 s5, s5, s8
+; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164-NEXT: s_cbranch_execz .LBB11_4
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: v_mov_b32_e32 v3, s4
+; GFX1164-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s10, -1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s8, s2
+; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
+; GFX1164-NEXT: .LBB11_4:
+; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT: s_endpgm
;
-; GFX10-LABEL: sub_i64_varying:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: sub_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: sub_i64_varying:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX1132-LABEL: sub_i64_varying:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b64 s[4:5], 0
+; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1132-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1132-NEXT: s_add_u32 s4, s4, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_addc_u32 s5, s5, s7
+; GFX1132-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1132-NEXT: s_cbranch_execz .LBB11_4
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s10, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s8, s2
+; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
+; GFX1132-NEXT: .LBB11_4:
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT: s_endpgm
+;
+; GFX1264-LABEL: sub_i64_varying:
+; GFX1264: ; %bb.0: ; %entry
+; GFX1264-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[4:5], 0
+; GFX1264-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1264-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1264-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1264-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1264-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264-NEXT: v_writelane_b32 v2, s5, s10
+; GFX1264-NEXT: v_writelane_b32 v1, s4, s10
+; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1264-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1264-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1264-NEXT: s_cbranch_execz .LBB11_4
+; GFX1264-NEXT: ; %bb.3:
+; GFX1264-NEXT: v_mov_b32_e32 v3, s4
+; GFX1264-NEXT: v_mov_b32_e32 v4, s5
+; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: s_mov_b32 s8, s2
+; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264-NEXT: s_wait_loadcnt 0x0
+; GFX1264-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264-NEXT: .LBB11_4:
+; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1264-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s2, -1
+; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264-NEXT: s_nop 0
+; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264-NEXT: s_endpgm
+;
+; GFX1232-LABEL: sub_i64_varying:
+; GFX1232: ; %bb.0: ; %entry
+; GFX1232-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232-NEXT: s_mov_b64 s[4:5], 0
+; GFX1232-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1232-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1232-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1232-NEXT: s_lshl_b32 s8, 1, s3
+; GFX1232-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1232-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1232-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1232-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1232-NEXT: s_cbranch_execz .LBB11_4
+; GFX1232-NEXT: ; %bb.3:
+; GFX1232-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: s_mov_b32 s8, s2
+; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232-NEXT: s_wait_loadcnt 0x0
+; GFX1232-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232-NEXT: .LBB11_4:
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1232-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s2, -1
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: s_nop 0
+; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index af6f69130910d..453bd07647c73 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2,10 +2,10 @@
; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
declare i32 @llvm.amdgcn.workitem.id.x()
@@ -944,12 +944,12 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -974,11 +974,11 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -1005,11 +1005,11 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s3, v0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -1039,8 +1039,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
@@ -1068,8 +1068,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
@@ -1099,8 +1099,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: .LBB4_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -1132,8 +1132,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: .LBB4_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
@@ -1182,14 +1182,14 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s4, s0
; GFX7LESS-NEXT: s_mov_b32 s5, s1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v0
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2
; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2
; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2
; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v2
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
@@ -1217,10 +1217,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s5
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
@@ -1256,10 +1256,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_readfirstlane_b32 s5, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
@@ -1296,8 +1296,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1064-NEXT: .LBB5_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
@@ -1331,8 +1331,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1032-NEXT: .LBB5_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
@@ -1367,8 +1367,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB5_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
@@ -1407,8 +1407,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB5_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
; GFX1132-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
@@ -1445,56 +1445,301 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: add_i64_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: s_mov_b64 s[2:3], 0
+; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX8-NEXT: s_mov_b32 m0, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_u32 s2, s2, s8
+; GFX8-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8-NEXT: s_addc_u32 s3, s3, s7
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB6_4
+; GFX8-NEXT: ; %bb.3:
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
+; GFX8-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: .LBB6_4:
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX9-NEXT: s_mov_b32 m0, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_u32 s2, s2, s8
+; GFX9-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9-NEXT: s_addc_u32 s3, s3, s7
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB6_4
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: .LBB6_4:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX10-LABEL: add_i64_varying:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: add_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX1064-LABEL: add_i64_varying:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1064-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1064-NEXT: s_add_u32 s2, s2, s7
+; GFX1064-NEXT: s_addc_u32 s3, s3, s8
+; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT: s_cbranch_execz .LBB6_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: buffer_gl0_inv
+; GFX1064-NEXT: .LBB6_4:
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: add_i64_varying:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032-NEXT: s_mov_b64 s[2:3], 0
+; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032-NEXT: s_add_u32 s2, s2, s6
+; GFX1032-NEXT: s_addc_u32 s3, s3, s7
+; GFX1032-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1032-NEXT: s_andn2_b32 s4, s4, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032-NEXT: s_cbranch_execz .LBB6_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: buffer_gl0_inv
+; GFX1032-NEXT: .LBB6_4:
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: add_i64_varying:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[4:5]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1164-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1164-NEXT: s_add_u32 s2, s2, s7
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_addc_u32 s3, s3, s8
+; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: buffer_gl0_inv
+; GFX1164-NEXT: .LBB6_4:
+; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1164-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: add_i64_varying:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b64 s[2:3], 0
+; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132-NEXT: s_add_u32 s2, s2, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_addc_u32 s3, s3, s7
+; GFX1132-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1132-NEXT: s_and_not1_b32 s4, s4, s5
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132-NEXT: s_cbranch_execz .LBB6_4
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: buffer_gl0_inv
+; GFX1132-NEXT: .LBB6_4:
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1132-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
@@ -2444,12 +2689,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2475,12 +2720,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -2507,12 +2752,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_readfirstlane_b32 s5, v0
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
@@ -2696,14 +2941,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s4, s0
; GFX7LESS-NEXT: s_mov_b32 s5, s1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v0
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2
; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2
; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2
; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s0
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v2
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
@@ -2736,11 +2981,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s1, v2
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
@@ -2776,11 +3021,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
@@ -2972,56 +3217,301 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: sub_i64_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: s_mov_b64 s[2:3], 0
+; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX8-NEXT: s_mov_b32 m0, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_u32 s2, s2, s8
+; GFX8-NEXT: v_writelane_b32 v2, s3, m0
+; GFX8-NEXT: s_addc_u32 s3, s3, s7
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB13_4
+; GFX8-NEXT: ; %bb.3:
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
+; GFX8-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: .LBB13_4:
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX9-NEXT: s_mov_b32 m0, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_u32 s2, s2, s8
+; GFX9-NEXT: v_writelane_b32 v2, s3, m0
+; GFX9-NEXT: s_addc_u32 s3, s3, s7
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB13_4
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: .LBB13_4:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
-; GFX10-LABEL: sub_i64_varying:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: sub_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX1064-LABEL: sub_i64_varying:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1064-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1064-NEXT: s_add_u32 s2, s2, s7
+; GFX1064-NEXT: s_addc_u32 s3, s3, s8
+; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT: s_cbranch_execz .LBB13_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: buffer_gl0_inv
+; GFX1064-NEXT: .LBB13_4:
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: sub_i64_varying:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032-NEXT: s_mov_b64 s[2:3], 0
+; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1032-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1032-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1032-NEXT: s_add_u32 s2, s2, s6
+; GFX1032-NEXT: s_addc_u32 s3, s3, s7
+; GFX1032-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1032-NEXT: s_andn2_b32 s4, s4, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1032-NEXT: s_cbranch_execz .LBB13_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, s2
+; GFX1032-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: buffer_gl0_inv
+; GFX1032-NEXT: .LBB13_4:
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: sub_i64_varying:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[4:5]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s2, s6
+; GFX1164-NEXT: v_writelane_b32 v2, s3, s6
+; GFX1164-NEXT: s_add_u32 s2, s2, s7
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_addc_u32 s3, s3, s8
+; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT: s_cbranch_execz .LBB13_4
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: v_mov_b32_e32 v4, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s2
+; GFX1164-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: buffer_gl0_inv
+; GFX1164-NEXT: .LBB13_4:
+; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v1
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: sub_i64_varying:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b64 s[2:3], 0
+; GFX1132-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132-NEXT: .LBB13_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132-NEXT: v_readlane_b32 s7, v3, s5
+; GFX1132-NEXT: v_writelane_b32 v1, s2, s5
+; GFX1132-NEXT: v_writelane_b32 v2, s3, s5
+; GFX1132-NEXT: s_add_u32 s2, s2, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_addc_u32 s3, s3, s7
+; GFX1132-NEXT: s_lshl_b32 s5, 1, s5
+; GFX1132-NEXT: s_and_not1_b32 s4, s4, s5
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX1132-NEXT: s_cbranch_execz .LBB13_4
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: v_mov_b32_e32 v4, s3
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: buffer_gl0_inv
+; GFX1132-NEXT: .LBB13_4:
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
@@ -4149,8 +4639,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -4249,8 +4739,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
@@ -4280,8 +4770,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
@@ -4312,8 +4802,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: .LBB18_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4345,8 +4835,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: .LBB18_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4663,8 +5153,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -4763,8 +5253,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
@@ -4794,8 +5284,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
@@ -4826,8 +5316,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: .LBB20_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4859,8 +5349,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: .LBB20_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5177,8 +5667,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
@@ -5208,8 +5698,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
@@ -5240,8 +5730,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
@@ -5274,8 +5764,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
@@ -5305,8 +5795,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
@@ -5337,8 +5827,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: .LBB22_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5370,8 +5860,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: .LBB22_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5688,8 +6178,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
@@ -5719,8 +6209,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
@@ -5751,8 +6241,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
@@ -5785,8 +6275,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
@@ -5816,8 +6306,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
@@ -5848,8 +6338,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: .LBB24_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5881,8 +6371,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: .LBB24_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index ee0910b21f024..60c3328b08c6c 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -170,9 +170,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
- ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
- ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
+ ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -185,14 +184,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX908-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
; GFX908-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
; GFX908-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
- ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
- ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]]
+ ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]]
; GFX908-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX908-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX908-NEXT: S_BRANCH %bb.2
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: bb.2 (%ir-block.35):
+ ; GFX908-NEXT: bb.2 (%ir-block.31):
; GFX908-NEXT: successors: %bb.3(0x80000000)
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -204,7 +203,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: bb.4 (%ir-block.37):
+ ; GFX908-NEXT: bb.4 (%ir-block.33):
; GFX908-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX908-NEXT: S_ENDPGM 0
;
@@ -232,9 +231,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
- ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
+ ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -247,14 +245,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
- ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]]
+ ; GFX90A_GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]]
; GFX90A_GFX940-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.2
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.35):
+ ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.31):
; GFX90A_GFX940-NEXT: successors: %bb.3(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -266,7 +264,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.37):
+ ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.33):
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_ENDPGM 0
;
@@ -290,9 +288,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11_GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $exec_lo
; GFX11_GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11_GFX12-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
- ; GFX11_GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
- ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
+ ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec
@@ -301,15 +298,15 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX11_GFX12-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec
+ ; GFX11_GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec
; GFX11_GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX11_GFX12-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11_GFX12-NEXT: S_BRANCH %bb.2
; GFX11_GFX12-NEXT: {{ $}}
- ; GFX11_GFX12-NEXT: bb.2 (%ir-block.28):
+ ; GFX11_GFX12-NEXT: bb.2 (%ir-block.24):
; GFX11_GFX12-NEXT: successors: %bb.3(0x80000000)
; GFX11_GFX12-NEXT: {{ $}}
; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -320,7 +317,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11_GFX12-NEXT: {{ $}}
; GFX11_GFX12-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11_GFX12-NEXT: {{ $}}
- ; GFX11_GFX12-NEXT: bb.4 (%ir-block.30):
+ ; GFX11_GFX12-NEXT: bb.4 (%ir-block.26):
; GFX11_GFX12-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11_GFX12-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index 3454e9d1019e5..8eb38344fc944 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -176,9 +176,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $exec_lo
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
- ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
+ ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -187,24 +186,24 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec
+ ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 15
- ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_3]]
- ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed [[V_READLANE_B32_]], killed [[S_MOV_B32_4]], [[V_MOV_B32_dpp5]]
- ; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31
- ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_5]]
+ ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 15
+ ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed [[V_READLANE_B32_]], killed [[S_MOV_B32_3]], [[V_MOV_B32_dpp5]]
+ ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 31
+ ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_4]]
; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec
; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.2
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.2 (%ir-block.36):
+ ; GFX11-NEXT: bb.2 (%ir-block.29):
; GFX11-NEXT: successors: %bb.4(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -219,17 +218,17 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.5
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.4 (%ir-block.39):
+ ; GFX11-NEXT: bb.4 (%ir-block.32):
; GFX11-NEXT: successors: %bb.3(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
- ; GFX11-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
- ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: early-clobber %43:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
+ ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %43, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.5 (%ir-block.47):
+ ; GFX11-NEXT: bb.5 (%ir-block.37):
; GFX11-NEXT: $vgpr0 = COPY [[PHI]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
index 9d8b987d2ba68..60149b90cb048 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
@@ -1,255 +1,1199 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX90A,GFX90A_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX90A,GFX90A_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX940,GFX940_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX940,GFX940_DPP %s
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX90A-NEXT: $sgpr0 = COPY [[COPY5]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[COPY6]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
+ ; GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX90A-NEXT: $sgpr0 = COPY [[COPY5]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[COPY6]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
+ ; GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
+ ; GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
+ ; GFX940: bb.0 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret double %ret
}
define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX90A_ITERATIVE: bb.0 (%ir-block.0):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.1 (%ir-block.5):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY6]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.7):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY %42
+ ; GFX90A_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY8]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.3.Flow:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.9):
+ ; GFX90A_ITERATIVE-NEXT: S_ENDPGM 0
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.5.Flow1:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.4
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.6.ComputeLoop:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %26, %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[COPY7]], %bb.1, %5, %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY9]], [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE2]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX90A_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc
+ ; GFX90A_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI1]], killed [[S_LSHL_B64_]], implicit-def dead $scc
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX90A_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc
+ ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeEnd:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY13]], [[COPY14]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY12]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ;
+ ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX90A_DPP: bb.0 (%ir-block.0):
+ ; GFX90A_DPP-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.1 (%ir-block.5):
+ ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1
+ ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY8]], [[COPY9]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY7]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: early-clobber %1:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.2 (%ir-block.31):
+ ; GFX90A_DPP-NEXT: successors: %bb.3(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY %1
+ ; GFX90A_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY12]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.3.Flow:
+ ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.4 (%ir-block.33):
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX940_ITERATIVE: bb.0 (%ir-block.0):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.1 (%ir-block.5):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY6]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.7):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY %41
+ ; GFX940_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY8]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.3.Flow:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.9):
+ ; GFX940_ITERATIVE-NEXT: S_ENDPGM 0
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.5.Flow1:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.4
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.6.ComputeLoop:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %25, %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[COPY7]], %bb.1, %5, %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY9]], [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE2]]
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX940_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc
+ ; GFX940_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI1]], killed [[S_LSHL_B64_]], implicit-def dead $scc
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX940_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc
+ ; GFX940_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.7.ComputeEnd:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY13]], [[COPY14]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY12]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ;
+ ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
+ ; GFX940_DPP: bb.0 (%ir-block.0):
+ ; GFX940_DPP-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000)
+ ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.1 (%ir-block.5):
+ ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1
+ ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY8]], [[COPY9]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY7]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX940_DPP-NEXT: early-clobber %1:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.2 (%ir-block.31):
+ ; GFX940_DPP-NEXT: successors: %bb.3(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY %1
+ ; GFX940_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY12]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.3.Flow:
+ ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.4 (%ir-block.33):
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret void
}
define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX90A_ITERATIVE: bb.0 (%ir-block.0):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX90A_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]]
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.1 (%ir-block.5):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[COPY7]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]]
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.7):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY %68
+ ; GFX90A_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY10]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.9):
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %77, %bb.7, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE2]], 0, %12, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.13):
+ ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY %5.sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY %5.sub1
+ ; GFX90A_ITERATIVE-NEXT: $sgpr0 = COPY [[COPY13]]
+ ; GFX90A_ITERATIVE-NEXT: $sgpr1 = COPY [[COPY14]]
+ ; GFX90A_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.5.Flow:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, [[V_ADD_F64_e64_]], %bb.3
+ ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.4
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.6.ComputeLoop:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %42, %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI [[COPY9]], %bb.1, %9, %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[COPY8]], %bb.1, %11, %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI4]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY15]], [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY16]], [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
+ ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY18]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_2]], $m0, [[COPY17]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+ ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY20]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_3]], $m0, [[COPY19]]
+ ; GFX90A_ITERATIVE-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_1]], %subreg.sub0, [[V_WRITELANE_B32_]], %subreg.sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]]
+ ; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE3]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY22]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX90A_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc
+ ; GFX90A_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], killed [[S_LSHL_B64_]], implicit-def dead $scc
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX90A_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc
+ ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeEnd:
+ ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX90A_ITERATIVE-NEXT: {{ $}}
+ ; GFX90A_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[COPY21]], %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.6
+ ; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY24]], [[COPY25]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY23]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[DEF7]]
+ ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ;
+ ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX90A_DPP: bb.0 (%ir-block.0):
+ ; GFX90A_DPP-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX90A_DPP-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]]
+ ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.1
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.1 (%ir-block.5):
+ ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
+ ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY8]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY12]], [[S_MOV_B32_1]]
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: early-clobber %2:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]]
+ ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.2 (%ir-block.32):
+ ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY %2
+ ; GFX90A_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY14]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.4
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.3.Flow:
+ ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, %7, %bb.4
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.5
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.4 (%ir-block.35):
+ ; GFX90A_DPP-NEXT: successors: %bb.3(0x80000000)
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY13]], %bb.1, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2
+ ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY16]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; GFX90A_DPP-NEXT: early-clobber %55:vreg_64_align2 = STRICT_WWM [[V_MOV_B7]], implicit $exec
+ ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE3]], 0, killed %55, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX90A_DPP-NEXT: {{ $}}
+ ; GFX90A_DPP-NEXT: bb.5 (%ir-block.40):
+ ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX90A_DPP-NEXT: $sgpr0 = COPY [[COPY17]]
+ ; GFX90A_DPP-NEXT: $sgpr1 = COPY [[COPY18]]
+ ; GFX90A_DPP-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ;
+ ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX940_ITERATIVE: bb.0 (%ir-block.0):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX940_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]]
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.1 (%ir-block.5):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+ ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[COPY7]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]]
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.7):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY %67
+ ; GFX940_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY10]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.9):
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %76, %bb.7, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE2]], 0, %12, 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.13):
+ ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY %5.sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY %5.sub1
+ ; GFX940_ITERATIVE-NEXT: $sgpr0 = COPY [[COPY13]]
+ ; GFX940_ITERATIVE-NEXT: $sgpr1 = COPY [[COPY14]]
+ ; GFX940_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.5.Flow:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, [[V_ADD_F64_e64_]], %bb.3
+ ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.4
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.6.ComputeLoop:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %41, %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI [[COPY9]], %bb.1, %9, %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[COPY8]], %bb.1, %11, %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI4]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY15]], [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY16]], [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
+ ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY18]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_2]], $m0, [[COPY17]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+ ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]]
+ ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY20]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_3]], $m0, [[COPY19]]
+ ; GFX940_ITERATIVE-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_1]], %subreg.sub0, [[V_WRITELANE_B32_]], %subreg.sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]]
+ ; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE3]]
+ ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY22]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX940_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc
+ ; GFX940_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], killed [[S_LSHL_B64_]], implicit-def dead $scc
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX940_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc
+ ; GFX940_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: bb.7.ComputeEnd:
+ ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX940_ITERATIVE-NEXT: {{ $}}
+ ; GFX940_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[COPY21]], %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.6
+ ; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1
+ ; GFX940_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY24]], [[COPY25]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY23]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX940_ITERATIVE-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[DEF7]]
+ ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2
+ ;
+ ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
+ ; GFX940_DPP: bb.0 (%ir-block.0):
+ ; GFX940_DPP-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
+ ; GFX940_DPP-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]]
+ ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.1
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.1 (%ir-block.5):
+ ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec
+ ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
+ ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY9]], [[COPY10]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY8]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1
+ ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0
+ ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY12]], [[S_MOV_B32_1]]
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1
+ ; GFX940_DPP-NEXT: early-clobber %2:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; GFX940_DPP-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]]
+ ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.2
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.2 (%ir-block.32):
+ ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY %2
+ ; GFX940_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY14]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.4
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.3.Flow:
+ ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, %7, %bb.4
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.5
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.4 (%ir-block.35):
+ ; GFX940_DPP-NEXT: successors: %bb.3(0x80000000)
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY13]], %bb.1, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2
+ ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+ ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY16]], implicit $exec
+ ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; GFX940_DPP-NEXT: early-clobber %54:vreg_64_align2 = STRICT_WWM [[V_MOV_B7]], implicit $exec
+ ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE3]], 0, killed %54, 0, 0, implicit $mode, implicit $exec
+ ; GFX940_DPP-NEXT: S_BRANCH %bb.3
+ ; GFX940_DPP-NEXT: {{ $}}
+ ; GFX940_DPP-NEXT: bb.5 (%ir-block.40):
+ ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+ ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+ ; GFX940_DPP-NEXT: $sgpr0 = COPY [[COPY17]]
+ ; GFX940_DPP-NEXT: $sgpr1 = COPY [[COPY18]]
+ ; GFX940_DPP-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret double %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
index 722c069f90a8c..166865b9b866f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -9,7 +9,7 @@
define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR: 2:
; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -28,16 +28,14 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_uns
; IR-NEXT: br label [[TMP16]]
; IR: 16:
; IR-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]])
-; IR-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
-; IR-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]]
-; IR-NEXT: br label [[TMP24]]
-; IR: 24:
-; IR-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-NEXT: ret float [[TMP25]]
+; IR-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]])
+; IR-NEXT: [[TMP19:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT: [[TMP20:%.*]] = fmul float [[VAL]], [[TMP19]]
+; IR-NEXT: [[TMP21:%.*]] = fadd float [[TMP18]], [[TMP20]]
+; IR-NEXT: br label [[TMP22]]
+; IR: 22:
+; IR-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-NEXT: ret float [[TMP23]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
ret float %result
@@ -46,7 +44,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_uns
define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 {
; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -57,44 +55,37 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] syncscope("agent") monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP28:%.*]]
-; IR-ITERATIVE-NEXT: br label [[TMP18]]
-; IR-ITERATIVE: 18:
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]])
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = fadd float [[TMP14]], [[TMP21:%.*]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret float [[TMP17]]
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]])
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
-; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
-; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
-; IR-ITERATIVE-NEXT: [[TMP29]] = fadd float [[ACCUMULATOR]], [[TMP24]]
-; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
-; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1
-; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]]
-; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]])
+; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]])
+; IR-ITERATIVE-NEXT: [[TMP22]] = fadd float [[ACCUMULATOR]], [[TMP20]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -102,43 +93,36 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648)
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP14:%.*]] = fadd float [[TMP11]], [[TMP13]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP16:%.*]] = fadd float [[TMP14]], [[TMP15]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP18:%.*]] = fadd float [[TMP16]], [[TMP17]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP18]], [[TMP19]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63)
-; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
-; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]])
-; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]]
-; IR-DPP: 31:
-; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = fadd float [[TMP9]], [[TMP10]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = fadd float [[TMP11]], [[TMP12]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP13]], [[TMP14]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = fadd float [[TMP17]], [[TMP18]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = fadd float [[TMP19]], [[TMP20]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP21]], i32 312, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]])
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]])
+; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]])
+; IR-DPP-NEXT: [[TMP32:%.*]] = fadd float [[TMP30]], [[TMP31]]
; IR-DPP-NEXT: br label [[TMP33]]
; IR-DPP: 33:
-; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
-; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]])
-; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
-; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
-; IR-DPP-NEXT: [[TMP39:%.*]] = fadd float [[TMP37]], [[TMP38]]
-; IR-DPP-NEXT: br label [[TMP40]]
-; IR-DPP: 40:
-; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ]
-; IR-DPP-NEXT: ret float [[TMP41]]
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret float [[TMP34]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
ret float %result
@@ -147,7 +131,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 {
; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7:[0-9]+]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -166,20 +150,18 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
; IR-ITERATIVE-NEXT: br label [[TMP16]]
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP24]]
-; IR-ITERATIVE: 24:
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP25]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP22]]
+; IR-ITERATIVE: 22:
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-ITERATIVE-NEXT: ret float [[TMP23]]
;
; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8:[0-9]+]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -198,16 +180,14 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
; IR-DPP-NEXT: br label [[TMP16]]
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP24]]
-; IR-DPP: 24:
-; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-DPP-NEXT: ret float [[TMP25]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP22]]
+; IR-DPP: 22:
+; IR-DPP-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-DPP-NEXT: ret float [[TMP23]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
ret float %result
@@ -216,7 +196,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float %val) #1 {
; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -227,44 +207,37 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("one-as") monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] syncscope("one-as") monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP18]]
-; IR-ITERATIVE: 18:
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret float [[TMP17]]
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
-; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
-; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
-; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1
-; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]]
-; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -272,43 +245,36 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
-; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]]
-; IR-DPP: 31:
-; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP30]], float [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: br label [[TMP33]]
; IR-DPP: 33:
-; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
-; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
-; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP40]]
-; IR-DPP: 40:
-; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ]
-; IR-DPP-NEXT: ret float [[TMP41]]
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret float [[TMP34]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
ret float %result
@@ -317,7 +283,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 {
; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -336,20 +302,18 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
; IR-ITERATIVE-NEXT: br label [[TMP16]]
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP24]]
-; IR-ITERATIVE: 24:
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP25]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP22]]
+; IR-ITERATIVE: 22:
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-ITERATIVE-NEXT: ret float [[TMP23]]
;
; IR-DPP-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -368,16 +332,14 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
; IR-DPP-NEXT: br label [[TMP16]]
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP24]]
-; IR-DPP: 24:
-; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-DPP-NEXT: ret float [[TMP25]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP22]]
+; IR-DPP: 22:
+; IR-DPP-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-DPP-NEXT: ret float [[TMP23]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
@@ -386,7 +348,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -397,44 +359,37 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] syncscope("agent") monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP18]]
-; IR-ITERATIVE: 18:
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP14]], float [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret float [[TMP17]]
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
-; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
-; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
-; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1
-; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]]
-; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -442,43 +397,36 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
-; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]]
-; IR-DPP: 31:
-; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP30]], float [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: br label [[TMP33]]
; IR-DPP: 33:
-; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
-; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
-; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP40]]
-; IR-DPP: 40:
-; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ]
-; IR-DPP-NEXT: ret float [[TMP41]]
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret float [[TMP34]]
;
%result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
@@ -487,7 +435,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
; IR: 2:
; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -502,16 +450,14 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_uns
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
-; IR-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
-; IR-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]])
-; IR-NEXT: br label [[TMP20]]
-; IR: 20:
-; IR-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
-; IR-NEXT: ret float [[TMP21]]
+; IR-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]])
+; IR-NEXT: [[TMP15:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
+; IR-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP14]], float [[TMP16]])
+; IR-NEXT: br label [[TMP18]]
+; IR: 18:
+; IR-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
+; IR-NEXT: ret float [[TMP19]]
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
@@ -520,7 +466,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_uns
define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 {
; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -531,44 +477,37 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] syncscope("agent") monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP28:%.*]])
-; IR-ITERATIVE-NEXT: br label [[TMP18]]
-; IR-ITERATIVE: 18:
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]])
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.minnum.f32(float [[TMP14]], float [[TMP21:%.*]])
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret float [[TMP17]]
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]])
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
-; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
-; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
-; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP24]])
-; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
-; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1
-; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]]
-; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]])
+; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]])
+; IR-ITERATIVE-NEXT: [[TMP22]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP20]])
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -576,43 +515,36 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 2139095040)
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP11]], i32 273, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.minnum.f32(float [[TMP11]], float [[TMP13]])
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP14]], i32 274, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.minnum.f32(float [[TMP14]], float [[TMP15]])
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP16]], i32 276, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP17]])
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP18]], i32 280, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.minnum.f32(float [[TMP18]], float [[TMP19]])
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP20]], i32 322, i32 10, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.minnum.f32(float [[TMP20]], float [[TMP21]])
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]])
-; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63)
-; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
-; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]])
-; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]]
-; IR-DPP: 31:
-; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float 0x7FF0000000000000)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.minnum.f32(float [[TMP9]], float [[TMP10]])
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.minnum.f32(float [[TMP11]], float [[TMP12]])
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.minnum.f32(float [[TMP13]], float [[TMP14]])
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP15]], float [[TMP16]])
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP17]], float [[TMP18]])
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.minnum.f32(float [[TMP19]], float [[TMP20]])
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP21]], i32 312, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]])
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]])
+; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]])
+; IR-DPP-NEXT: [[TMP32:%.*]] = call float @llvm.minnum.f32(float [[TMP30]], float [[TMP31]])
; IR-DPP-NEXT: br label [[TMP33]]
; IR-DPP: 33:
-; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
-; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]])
-; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
-; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
-; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.minnum.f32(float [[TMP37]], float [[TMP38]])
-; IR-DPP-NEXT: br label [[TMP40]]
-; IR-DPP: 40:
-; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ]
-; IR-DPP-NEXT: ret float [[TMP41]]
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret float [[TMP34]]
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
@@ -621,7 +553,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1{
; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -636,20 +568,18 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
; IR-ITERATIVE-NEXT: br label [[TMP12]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]]
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP20]]
-; IR-ITERATIVE: 20:
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP21]]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]]
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP14]], float [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP18]]
+; IR-ITERATIVE: 18:
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret float [[TMP19]]
;
; IR-DPP-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -664,16 +594,14 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
; IR-DPP-NEXT: br label [[TMP12]]
; IR-DPP: 12:
; IR-DPP-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP20]]
-; IR-DPP: 20:
-; IR-DPP-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
-; IR-DPP-NEXT: ret float [[TMP21]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP14]], float [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP18]]
+; IR-DPP: 18:
+; IR-DPP-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
+; IR-DPP-NEXT: ret float [[TMP19]]
;
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
@@ -682,7 +610,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float %val) #1{
; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -693,44 +621,37 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] syncscope("agent") monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP18]]
-; IR-ITERATIVE: 18:
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP14]], float [[TMP21:%.*]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret float [[TMP17]]
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
-; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
-; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
-; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1
-; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]]
-; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP20]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -738,43 +659,36 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -8388608) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP11]], float [[TMP13]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP14]], float [[TMP15]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP17]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP18]], float [[TMP19]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP20]], float [[TMP21]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
-; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]]
-; IR-DPP: 31:
-; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float 0xFFF0000000000000) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP9]], float [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP11]], float [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP13]], float [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP15]], float [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP17]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP19]], float [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP30]], float [[TMP31]], metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: br label [[TMP33]]
; IR-DPP: 33:
-; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
-; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
-; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP37]], float [[TMP38]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP40]]
-; IR-DPP: 40:
-; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ]
-; IR-DPP-NEXT: ret float [[TMP41]]
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret float [[TMP34]]
;
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
@@ -783,7 +697,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 {
; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -802,20 +716,18 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st
; IR-ITERATIVE-NEXT: br label [[TMP16]]
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP24]]
-; IR-ITERATIVE: 24:
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP25]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP22]]
+; IR-ITERATIVE: 22:
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-ITERATIVE-NEXT: ret float [[TMP23]]
;
; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -834,16 +746,14 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st
; IR-DPP-NEXT: br label [[TMP16]]
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP22]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP24]]
-; IR-DPP: 24:
-; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-DPP-NEXT: ret float [[TMP25]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP17]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP22]]
+; IR-DPP: 22:
+; IR-DPP-NEXT: [[TMP23:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-DPP-NEXT: ret float [[TMP23]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
ret float %result
@@ -852,7 +762,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st
define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -863,44 +773,37 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP22:%.*]] monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP18]]
-; IR-ITERATIVE: 18:
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret float [[TMP17]]
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
-; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
-; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
-; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1
-; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]]
-; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call float @llvm.amdgcn.writelane.f32(float [[ACCUMULATOR]], i32 [[TMP19]], float [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -908,43 +811,36 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
-; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP30]], label [[TMP31:%.*]], label [[TMP33:%.*]]
-; IR-DPP: 31:
-; IR-DPP-NEXT: [[TMP32:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29]] monotonic, align 4
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP24]] monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP30]], float [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: br label [[TMP33]]
; IR-DPP: 33:
-; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
-; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
-; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP40]]
-; IR-DPP: 40:
-; IR-DPP-NEXT: [[TMP41:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP39]], [[TMP33]] ]
-; IR-DPP-NEXT: ret float [[TMP41]]
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret float [[TMP34]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
ret float %result
@@ -1061,7 +957,7 @@ define amdgpu_ps float @global_atomic_fadd_div_address_div_value_system_scope_st
define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 {
; IR-LABEL: @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(
; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR: 2:
; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -1080,31 +976,101 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_s
; IR-NEXT: br label [[TMP16]]
; IR: 16:
; IR-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
-; IR-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
-; IR-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
-; IR-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]])
-; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]])
-; IR-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
-; IR-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
-; IR-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
-; IR-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP8]] to double
-; IR-NEXT: [[TMP28:%.*]] = fmul double [[VAL]], [[TMP27]]
-; IR-NEXT: [[TMP29:%.*]] = fadd double [[TMP26]], [[TMP28]]
-; IR-NEXT: br label [[TMP30]]
-; IR: 30:
-; IR-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
-; IR-NEXT: ret double [[TMP31]]
+; IR-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]])
+; IR-NEXT: [[TMP19:%.*]] = uitofp i32 [[TMP8]] to double
+; IR-NEXT: [[TMP20:%.*]] = fmul double [[VAL]], [[TMP19]]
+; IR-NEXT: [[TMP21:%.*]] = fadd double [[TMP18]], [[TMP20]]
+; IR-NEXT: br label [[TMP22]]
+; IR: 22:
+; IR-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-NEXT: ret double [[TMP23]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
ret double %result
}
define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]])
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = fadd double [[TMP14]], [[TMP21:%.*]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]])
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]])
+; IR-ITERATIVE-NEXT: [[TMP22]] = fadd double [[ACCUMULATOR]], [[TMP20]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = fadd double [[TMP11]], [[TMP12]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = fadd double [[TMP13]], [[TMP14]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = fadd double [[TMP15]], [[TMP16]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = fadd double [[TMP17]], [[TMP18]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = fadd double [[TMP19]], [[TMP20]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]])
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]])
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]])
+; IR-DPP-NEXT: [[TMP32:%.*]] = fadd double [[TMP30]], [[TMP31]]
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
ret double %result
@@ -1113,7 +1079,7 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_a
define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 {
; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -1132,26 +1098,18 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
; IR-ITERATIVE-NEXT: br label [[TMP16]]
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
-; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP30]]
-; IR-ITERATIVE: 30:
-; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
-; IR-ITERATIVE-NEXT: ret double [[TMP31]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP22]]
+; IR-ITERATIVE: 22:
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP23]]
;
; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -1170,31 +1128,101 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
; IR-DPP-NEXT: br label [[TMP16]]
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
-; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
-; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
-; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
-; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
-; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
-; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP30]]
-; IR-DPP: 30:
-; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
-; IR-DPP-NEXT: ret double [[TMP31]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP22]]
+; IR-DPP: 22:
+; IR-DPP-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-DPP-NEXT: ret double [[TMP23]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
ret double %result
}
define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("one-as") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("one-as") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
ret double %result
@@ -1203,7 +1231,7 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_
define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 {
; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -1222,26 +1250,18 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s
; IR-ITERATIVE-NEXT: br label [[TMP16]]
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
-; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP30]]
-; IR-ITERATIVE: 30:
-; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
-; IR-ITERATIVE-NEXT: ret double [[TMP31]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP22]]
+; IR-ITERATIVE: 22:
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP23]]
;
; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -1260,31 +1280,101 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s
; IR-DPP-NEXT: br label [[TMP16]]
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
-; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
-; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
-; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
-; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
-; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
-; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP30]]
-; IR-DPP: 30:
-; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
-; IR-DPP-NEXT: ret double [[TMP31]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP22]]
+; IR-DPP: 22:
+; IR-DPP-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-DPP-NEXT: ret double [[TMP23]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret double %result
}
define amdgpu_ps double @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
-; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret double %result
@@ -1293,7 +1383,7 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_div_value_agent_s
define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 {
; IR-LABEL: @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(
; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]]
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
; IR: 2:
; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -1308,31 +1398,101 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_s
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64
-; IR-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
-; IR-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
-; IR-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]])
-; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]])
-; IR-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
-; IR-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
-; IR-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
-; IR-NEXT: [[TMP23:%.*]] = uitofp i32 [[TMP8]] to double
-; IR-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0x7FF0000000000000, double [[VAL]]
-; IR-NEXT: [[TMP25:%.*]] = call double @llvm.minnum.f64(double [[TMP22]], double [[TMP24]])
-; IR-NEXT: br label [[TMP26]]
-; IR: 26:
-; IR-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ]
-; IR-NEXT: ret double [[TMP27]]
+; IR-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]])
+; IR-NEXT: [[TMP15:%.*]] = uitofp i32 [[TMP8]] to double
+; IR-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], double 0x7FF0000000000000, double [[VAL]]
+; IR-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP14]], double [[TMP16]])
+; IR-NEXT: br label [[TMP18]]
+; IR: 18:
+; IR-NEXT: [[TMP19:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
+; IR-NEXT: ret double [[TMP19]]
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret double %result
}
define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
-; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]])
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP14]], double [[TMP21:%.*]])
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]])
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]])
+; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.minnum.f64(double [[ACCUMULATOR]], double [[TMP20]])
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF0000000000000)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.minnum.f64(double [[TMP9]], double [[TMP10]])
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.minnum.f64(double [[TMP11]], double [[TMP12]])
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP13]], double [[TMP14]])
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP15]], double [[TMP16]])
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.minnum.f64(double [[TMP17]], double [[TMP18]])
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.minnum.f64(double [[TMP19]], double [[TMP20]])
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP21]], i32 312, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]])
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]])
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]])
+; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.minnum.f64(double [[TMP30]], double [[TMP31]])
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret double %result
@@ -1341,7 +1501,7 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_s
define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{
; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -1356,26 +1516,18 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
; IR-ITERATIVE-NEXT: br label [[TMP12]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]]
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP22]], double [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP26]]
-; IR-ITERATIVE: 26:
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT: ret double [[TMP27]]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]]
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP14]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP18]]
+; IR-ITERATIVE: 18:
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP19]]
;
; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -1390,31 +1542,101 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
; IR-DPP-NEXT: br label [[TMP12]]
; IR-DPP: 12:
; IR-DPP-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64
-; IR-DPP-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
-; IR-DPP-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
-; IR-DPP-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
-; IR-DPP-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
-; IR-DPP-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
-; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP22]], double [[TMP24]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP26]]
-; IR-DPP: 26:
-; IR-DPP-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ]
-; IR-DPP-NEXT: ret double [[TMP27]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP14]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP18]]
+; IR-DPP: 18:
+; IR-DPP-NEXT: [[TMP19:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ]
+; IR-DPP-NEXT: ret double [[TMP19]]
;
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret double %result
}
define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{
-; IR-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.maxnum.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0xFFF0000000000000) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP9]], double [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP11]], double [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP13]], double [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP15]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP17]], double [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP19]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP30]], double [[TMP31]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret double %result
@@ -1423,7 +1645,7 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_
define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 {
; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-ITERATIVE: 2:
; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -1442,26 +1664,18 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_
; IR-ITERATIVE-NEXT: br label [[TMP16]]
; IR-ITERATIVE: 16:
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
-; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
-; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: br label [[TMP30]]
-; IR-ITERATIVE: 30:
-; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
-; IR-ITERATIVE-NEXT: ret double [[TMP31]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP22]]
+; IR-ITERATIVE: 22:
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP23]]
;
; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP22:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -1480,31 +1694,101 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_
; IR-DPP-NEXT: br label [[TMP16]]
; IR-DPP: 16:
; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
-; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
-; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
-; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
-; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
-; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
-; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: br label [[TMP30]]
-; IR-DPP: 30:
-; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
-; IR-DPP-NEXT: ret double [[TMP31]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP17]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP18]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP22]]
+; IR-DPP: 22:
+; IR-DPP-NEXT: [[TMP23:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP21]], [[TMP16]] ]
+; IR-DPP-NEXT: ret double [[TMP23]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
ret double %result
}
define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4
-; IR-NEXT: ret double [[RESULT]]
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP22:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[TMP21:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP15]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP17]]
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP21]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP25:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP21]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP19]], double [[OLDVALUEPHI]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = shl i64 1, [[TMP18]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = xor i64 [[TMP23]], -1
+; IR-ITERATIVE-NEXT: [[TMP25]] = and i64 [[ACTIVEBITS]], [[TMP24]]
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP26]], label [[COMPUTEEND]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[TMP10]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ]
+; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP33]]
+; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP34:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP32]], [[TMP28]] ]
+; IR-DPP-NEXT: ret double [[TMP34]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
ret double %result
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
index 38823681d1bb5..d1e50bd560cb2 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -59,27 +59,25 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #
; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 8:
-; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP14:%.*]] seq_cst, align 4
; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]]
; IR-ITERATIVE: 10:
; IR-ITERATIVE-NEXT: ret void
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP14]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP17:%.*]], [[COMPUTELOOP]] ]
; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
-; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]])
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
-; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1
-; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[DIVVALUE]], i32 [[TMP12]])
+; IR-ITERATIVE-NEXT: [[TMP14]] = fadd float [[ACCUMULATOR]], [[TMP13]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = shl i64 1, [[TMP11]]
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], -1
+; IR-ITERATIVE-NEXT: [[TMP17]] = and i64 [[ACTIVEBITS]], [[TMP16]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP18]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP6]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP19]], label [[TMP8:%.*]], label [[TMP10]]
;
; IR-DPP-LABEL: @global_atomic_fadd_div_value(
; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
@@ -90,32 +88,27 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
-; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32
-; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 -2147483648)
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
-; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float
-; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP12:%.*]] = fadd float [[TMP9]], [[TMP11]]
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP12]], i32 274, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP14:%.*]] = fadd float [[TMP12]], [[TMP13]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 276, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP16:%.*]] = fadd float [[TMP14]], [[TMP15]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 280, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP18:%.*]] = fadd float [[TMP16]], [[TMP17]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 322, i32 10, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP18]], [[TMP19]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
-; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63)
-; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float
-; IR-DPP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
-; IR-DPP-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0
-; IR-DPP-NEXT: br i1 [[TMP27]], label [[TMP28:%.*]], label [[TMP30:%.*]]
-; IR-DPP: 28:
-; IR-DPP-NEXT: [[TMP29:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP26]] seq_cst, align 4
-; IR-DPP-NEXT: br label [[TMP30]]
-; IR-DPP: 30:
+; IR-DPP-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[DIVVALUE]], float -0.000000e+00)
+; IR-DPP-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP7]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = fadd float [[TMP9]], [[TMP10]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = fadd float [[TMP11]], [[TMP12]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP13]], [[TMP14]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = fadd float [[TMP17]], [[TMP18]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP19]], i32 63)
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP20]])
+; IR-DPP-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP6]], 0
+; IR-DPP-NEXT: br i1 [[TMP22]], label [[TMP23:%.*]], label [[TMP25:%.*]]
+; IR-DPP: 23:
+; IR-DPP-NEXT: [[TMP24:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP21]] seq_cst, align 4
+; IR-DPP-NEXT: br label [[TMP25]]
+; IR-DPP: 25:
; IR-DPP-NEXT: ret void
;
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -181,27 +174,25 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 8:
-; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP14:%.*]] seq_cst, align 4
; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]]
; IR-ITERATIVE: 10:
; IR-ITERATIVE-NEXT: ret void
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP14]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP17:%.*]], [[COMPUTELOOP]] ]
; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
-; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]])
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
-; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1
-; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[DIVVALUE]], i32 [[TMP12]])
+; IR-ITERATIVE-NEXT: [[TMP14]] = fadd float [[ACCUMULATOR]], [[TMP13]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = shl i64 1, [[TMP11]]
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], -1
+; IR-ITERATIVE-NEXT: [[TMP17]] = and i64 [[ACTIVEBITS]], [[TMP16]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP18]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP6]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP19]], label [[TMP8:%.*]], label [[TMP10]]
;
; IR-DPP-LABEL: @global_atomic_fsub_div_value(
; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
@@ -212,32 +203,27 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
-; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32
-; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 -2147483648)
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
-; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float
-; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP12:%.*]] = fadd float [[TMP9]], [[TMP11]]
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP12]], i32 274, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP14:%.*]] = fadd float [[TMP12]], [[TMP13]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 276, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP16:%.*]] = fadd float [[TMP14]], [[TMP15]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 280, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP18:%.*]] = fadd float [[TMP16]], [[TMP17]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 322, i32 10, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP18]], [[TMP19]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
-; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63)
-; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float
-; IR-DPP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
-; IR-DPP-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0
-; IR-DPP-NEXT: br i1 [[TMP27]], label [[TMP28:%.*]], label [[TMP30:%.*]]
-; IR-DPP: 28:
-; IR-DPP-NEXT: [[TMP29:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP26]] seq_cst, align 4
-; IR-DPP-NEXT: br label [[TMP30]]
-; IR-DPP: 30:
+; IR-DPP-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[DIVVALUE]], float -0.000000e+00)
+; IR-DPP-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP7]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = fadd float [[TMP9]], [[TMP10]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = fadd float [[TMP11]], [[TMP12]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP13]], [[TMP14]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = fadd float [[TMP17]], [[TMP18]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP19]], i32 63)
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP20]])
+; IR-DPP-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP6]], 0
+; IR-DPP-NEXT: br i1 [[TMP22]], label [[TMP23:%.*]], label [[TMP25:%.*]]
+; IR-DPP: 23:
+; IR-DPP-NEXT: [[TMP24:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP21]] seq_cst, align 4
+; IR-DPP-NEXT: br label [[TMP25]]
+; IR-DPP: 25:
; IR-DPP-NEXT: ret void
;
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
index fa66a0fdc76ce..8da8a9e9d3c61 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -49,33 +49,31 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] syncscope("agent") monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: br label [[TMP13]]
; IR-ITERATIVE: 13:
; IR-ITERATIVE-NEXT: ret void
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]])
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
-; IR-ITERATIVE-NEXT: [[TMP19]] = fadd float [[ACCUMULATOR]], [[TMP18]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1
-; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT: [[TMP17]] = fadd float [[ACCUMULATOR]], [[TMP16]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -83,34 +81,29 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648)
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP14:%.*]] = fadd float [[TMP11]], [[TMP13]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP16:%.*]] = fadd float [[TMP14]], [[TMP15]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP18:%.*]] = fadd float [[TMP16]], [[TMP17]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP18]], [[TMP19]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63)
-; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
-; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]])
-; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]]
-; IR-DPP: 30:
-; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: br label [[TMP32]]
-; IR-DPP: 32:
-; IR-DPP-NEXT: br label [[TMP33]]
-; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = fadd float [[TMP9]], [[TMP10]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = fadd float [[TMP11]], [[TMP12]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP13]], [[TMP14]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = fadd float [[TMP17]], [[TMP18]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = fadd float [[TMP19]], [[TMP20]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]])
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
; IR-DPP-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
@@ -184,33 +177,31 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("one-as") monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] syncscope("one-as") monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: br label [[TMP13]]
; IR-ITERATIVE: 13:
; IR-ITERATIVE-NEXT: ret void
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
-; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1
-; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -218,34 +209,29 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
-; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]]
-; IR-DPP: 30:
-; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT: br label [[TMP32]]
-; IR-DPP: 32:
-; IR-DPP-NEXT: br label [[TMP33]]
-; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
; IR-DPP-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
@@ -319,33 +305,31 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] syncscope("agent") monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: br label [[TMP13]]
; IR-ITERATIVE: 13:
; IR-ITERATIVE-NEXT: ret void
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
-; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1
-; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -353,34 +337,29 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
-; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]]
-; IR-DPP: 30:
-; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: br label [[TMP32]]
-; IR-DPP: 32:
-; IR-DPP-NEXT: br label [[TMP33]]
-; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
; IR-DPP-NEXT: ret void
;
%result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
@@ -426,33 +405,31 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] syncscope("agent") monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: br label [[TMP13]]
; IR-ITERATIVE: 13:
; IR-ITERATIVE-NEXT: ret void
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]])
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
-; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP18]])
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1
-; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT: [[TMP17]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP16]])
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -460,34 +437,29 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 2139095040)
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP11]], i32 273, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.minnum.f32(float [[TMP11]], float [[TMP13]])
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP14]], i32 274, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.minnum.f32(float [[TMP14]], float [[TMP15]])
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP16]], i32 276, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP17]])
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP18]], i32 280, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.minnum.f32(float [[TMP18]], float [[TMP19]])
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP20]], i32 322, i32 10, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.minnum.f32(float [[TMP20]], float [[TMP21]])
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]])
-; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63)
-; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
-; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]])
-; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]]
-; IR-DPP: 30:
-; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: br label [[TMP32]]
-; IR-DPP: 32:
-; IR-DPP-NEXT: br label [[TMP33]]
-; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float 0x7FF0000000000000)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.minnum.f32(float [[TMP9]], float [[TMP10]])
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.minnum.f32(float [[TMP11]], float [[TMP12]])
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.minnum.f32(float [[TMP13]], float [[TMP14]])
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP15]], float [[TMP16]])
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP17]], float [[TMP18]])
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.minnum.f32(float [[TMP19]], float [[TMP20]])
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]])
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
; IR-DPP-NEXT: ret void
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
@@ -553,33 +525,31 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] syncscope("agent") monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: br label [[TMP13]]
; IR-ITERATIVE: 13:
; IR-ITERATIVE-NEXT: ret void
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
-; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1
-; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -587,34 +557,29 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -8388608) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP11]], float [[TMP13]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP14]], float [[TMP15]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP17]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP18]], float [[TMP19]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP20]], float [[TMP21]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
-; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]]
-; IR-DPP: 30:
-; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: br label [[TMP32]]
-; IR-DPP: 32:
-; IR-DPP-NEXT: br label [[TMP33]]
-; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float 0xFFF0000000000000) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP9]], float [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP11]], float [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP13]], float [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP15]], float [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP17]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP19]], float [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
; IR-DPP-NEXT: ret void
;
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
@@ -688,33 +653,31 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP17:%.*]] monotonic, align 4
; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
; IR-ITERATIVE: 12:
; IR-ITERATIVE-NEXT: br label [[TMP13]]
; IR-ITERATIVE: 13:
; IR-ITERATIVE-NEXT: ret void
; IR-ITERATIVE: ComputeLoop:
-; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ]
-; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
-; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1
-; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
; IR-ITERATIVE: ComputeEnd:
-; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]]
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
;
; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
; IR-DPP: 2:
; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
@@ -722,34 +685,29 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
-; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
-; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP29]], label [[TMP30:%.*]], label [[TMP32:%.*]]
-; IR-DPP: 30:
-; IR-DPP-NEXT: [[TMP31:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP28]] monotonic, align 4
-; IR-DPP-NEXT: br label [[TMP32]]
-; IR-DPP: 32:
-; IR-DPP-NEXT: br label [[TMP33]]
-; IR-DPP: 33:
+; IR-DPP-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.set.inactive.f32(float [[VAL:%.*]], float -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP9]], float [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP13]], float [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP15]], float [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP17]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP19]], float [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP23]] monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
; IR-DPP-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
@@ -894,9 +852,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_agent_sco
}
define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT: [[TMP17]] = fadd double [[ACCUMULATOR]], [[TMP16]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = fadd double [[TMP11]], [[TMP12]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = fadd double [[TMP13]], [[TMP14]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = fadd double [[TMP15]], [[TMP16]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = fadd double [[TMP17]], [[TMP18]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = fadd double [[TMP19]], [[TMP20]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]])
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
ret void
@@ -956,9 +980,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_sc
}
define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("one-as") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("one-as") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
ret void
@@ -1018,9 +1108,75 @@ define amdgpu_ps void @global_atomic_fsub_double_uni_address_uni_value_agent_sco
}
define amdgpu_ps void @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
-; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret void
@@ -1052,9 +1208,75 @@ define amdgpu_ps void @global_atomic_fmin_double_uni_address_uni_value_agent_sco
}
define amdgpu_ps void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
-; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF0000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.minnum.f64(double [[ACCUMULATOR]], double [[TMP16]])
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF0000000000000)
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.minnum.f64(double [[TMP9]], double [[TMP10]])
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.minnum.f64(double [[TMP11]], double [[TMP12]])
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP13]], double [[TMP14]])
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP15]], double [[TMP16]])
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.minnum.f64(double [[TMP17]], double [[TMP18]])
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false)
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.minnum.f64(double [[TMP19]], double [[TMP20]])
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63)
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]])
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret void
@@ -1106,9 +1328,75 @@ define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_sco
}
define amdgpu_ps void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{
-; IR-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0xFFF0000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.maxnum.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0xFFF0000000000000) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP9]], double [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP11]], double [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP13]], double [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP15]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP17]], double [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0xFFF0000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP19]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret void
@@ -1168,9 +1456,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_system_sc
}
define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
-; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4
-; IR-NEXT: ret void
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
+; IR-ITERATIVE-NEXT: ret void
+; IR-ITERATIVE: ComputeLoop:
+; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1
+; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]]
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR-ITERATIVE: ComputeEnd:
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]]
+; IR-DPP: 25:
+; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP27]]
+; IR-DPP: 27:
+; IR-DPP-NEXT: br label [[TMP28]]
+; IR-DPP: 28:
+; IR-DPP-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 6555ceb3ed338..b6990c8b842fd 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -706,35 +706,37 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1910,35 +1912,37 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -3114,35 +3118,37 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -3834,35 +3840,37 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -5037,35 +5045,37 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -5863,10 +5873,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -5881,24 +5891,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB10_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB10_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB10_4
+; GFX9-NEXT: .LBB10_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -5908,43 +5941,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB10_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1064-NEXT: .LBB10_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -5954,115 +6010,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB10_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1032-NEXT: .LBB10_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB10_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1164-NEXT: .LBB10_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB10_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1132-NEXT: .LBB10_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6073,10 +6204,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -6091,24 +6222,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6118,161 +6308,367 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1064-DPP-NEXT: s_endpgm
-;
-; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
-; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1064-DPP-NEXT: .LBB10_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1164-DPP-NEXT: .LBB10_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1132-DPP-NEXT: .LBB10_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
@@ -6810,10 +7206,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -6828,24 +7224,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB12_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB12_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB12_4
+; GFX9-NEXT: .LBB12_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -6855,43 +7274,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB12_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1064-NEXT: .LBB12_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -6901,117 +7343,192 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB12_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1032-NEXT: .LBB12_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB12_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1164-NEXT: .LBB12_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_endpgm
-;
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB12_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1132-NEXT: .LBB12_5:
+; GFX1132-NEXT: s_endpgm
+;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -7020,10 +7537,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7038,24 +7555,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7065,43 +7641,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1064-DPP-NEXT: .LBB12_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7111,115 +7737,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1032-DPP-NEXT: .LBB12_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1164-DPP-NEXT: .LBB12_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1132-DPP-NEXT: .LBB12_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value() strictfp
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
@@ -7757,10 +8539,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7775,24 +8557,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB14_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB14_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB14_4
+; GFX9-NEXT: .LBB14_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
@@ -7802,43 +8607,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB14_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1064-NEXT: .LBB14_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
@@ -7848,115 +8676,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB14_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1032-NEXT: .LBB14_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB14_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1164-NEXT: .LBB14_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB14_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1132-NEXT: .LBB14_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
@@ -7967,10 +8870,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7985,24 +8888,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX9-DPP-NEXT: .LBB14_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
@@ -8012,43 +8974,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1064-DPP-NEXT: .LBB14_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
@@ -8058,115 +9070,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1032-DPP-NEXT: .LBB14_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1164-DPP-NEXT: .LBB14_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1132-DPP-NEXT: .LBB14_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
@@ -8235,10 +9403,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -8253,24 +9421,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB15_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB15_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB15_4
+; GFX9-NEXT: .LBB15_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8280,43 +9471,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB15_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1064-NEXT: .LBB15_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8326,115 +9540,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB15_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1032-NEXT: .LBB15_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB15_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1164-NEXT: .LBB15_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB15_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1132-NEXT: .LBB15_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8445,10 +9734,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -8463,24 +9752,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX9-DPP-NEXT: .LBB15_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8490,43 +9838,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1064-DPP-NEXT: .LBB15_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8536,115 +9934,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1032-DPP-NEXT: .LBB15_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1164-DPP-NEXT: .LBB15_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1132-DPP-NEXT: .LBB15_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value() strictfp
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
@@ -9182,10 +10736,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -9200,24 +10754,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB17_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB17_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB17_4
+; GFX9-NEXT: .LBB17_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
@@ -9227,43 +10804,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB17_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1064-NEXT: .LBB17_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
@@ -9273,115 +10873,190 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB17_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1032-NEXT: .LBB17_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB17_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1164-NEXT: .LBB17_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB17_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1132-NEXT: .LBB17_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
@@ -9392,10 +11067,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
 ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -9410,24 +11085,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX9-DPP-NEXT: .LBB17_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
@@ -9437,43 +11171,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1064-DPP-NEXT: .LBB17_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
@@ -9483,115 +11267,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1032-DPP-NEXT: .LBB17_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1164-DPP-NEXT: .LBB17_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1132-DPP-NEXT: .LBB17_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value() strictfp
%result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 7f052e1ae8b55..f512f17bbbcbf 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -607,42 +607,44 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1726,42 +1728,44 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -2905,42 +2909,44 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -3689,10 +3695,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -3707,26 +3713,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB7_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB7_4
+; GFX9-NEXT: .LBB7_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -3736,29 +3767,55 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT: .LBB7_4:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -3768,107 +3825,191 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT: .LBB7_4:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB7_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB7_4
+; GFX1164-NEXT: .LBB7_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB7_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB7_4
+; GFX1132-NEXT: .LBB7_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -3879,10 +4020,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -3897,26 +4038,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -3926,29 +4133,88 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB7_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -3958,107 +4224,288 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB7_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-DPP-NEXT: s_endpgm
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-DPP-NEXT: .LBB7_3:
+; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
@@ -4484,10 +4931,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -4502,26 +4949,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX9-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB9_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB9_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB9_4
+; GFX9-NEXT: .LBB9_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4531,45 +5003,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB9_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1064-NEXT: .LBB9_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4579,123 +5076,206 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB9_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1032-NEXT: .LBB9_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB9_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1164-NEXT: .LBB9_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB9_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1132-NEXT: .LBB9_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4706,10 +5286,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -4724,26 +5304,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4753,45 +5399,103 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12]
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[13:14], v[9:10]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4801,123 +5505,303 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
@@ -5343,10 +6227,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
@@ -5361,26 +6245,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX9-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB11_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB11_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB11_4
+; GFX9-NEXT: .LBB11_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -5390,45 +6299,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB11_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1064-NEXT: .LBB11_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -5438,123 +6372,206 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB11_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1032-NEXT: .LBB11_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB11_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1164-NEXT: .LBB11_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0xfff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB11_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1132-NEXT: .LBB11_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -5565,10 +6582,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -5583,26 +6600,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -5612,45 +6695,103 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12]
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[13:14], v[9:10]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
@@ -5660,123 +6801,303 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xfff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0xfff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1132-DPP-NEXT: .LBB11_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index a9f49ad1508d5..c3b3079db3adc 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -607,42 +607,44 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1726,42 +1728,44 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -2905,42 +2909,44 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -3689,10 +3695,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -3707,26 +3713,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB7_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB7_4
+; GFX9-NEXT: .LBB7_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -3736,29 +3767,55 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT: .LBB7_4:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -3768,107 +3825,191 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT: .LBB7_4:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB7_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB7_4
+; GFX1164-NEXT: .LBB7_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB7_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB7_4
+; GFX1132-NEXT: .LBB7_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -3879,10 +4020,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -3897,26 +4038,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -3926,29 +4133,88 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB7_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -3958,107 +4224,288 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_2
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB7_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-DPP-NEXT: s_endpgm
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-DPP-NEXT: .LBB7_3:
+; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
@@ -4484,10 +4931,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -4502,26 +4949,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX9-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB9_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB9_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB9_4
+; GFX9-NEXT: .LBB9_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4531,45 +5003,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB9_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1064-NEXT: .LBB9_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4579,123 +5076,206 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB9_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1032-NEXT: .LBB9_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB9_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1164-NEXT: .LBB9_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB9_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB9_4
+; GFX1132-NEXT: .LBB9_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4706,10 +5286,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -4724,26 +5304,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4753,45 +5399,103 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12]
+; GFX1064-DPP-NEXT: v_min_f64 v[9:10], v[13:14], v[9:10]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -4801,123 +5505,303 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX1032-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
@@ -5343,10 +6227,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -5361,26 +6245,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX9-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB11_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB11_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB11_4
+; GFX9-NEXT: .LBB11_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -5390,45 +6299,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB11_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1064-NEXT: .LBB11_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -5438,123 +6372,206 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB11_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1032-NEXT: .LBB11_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB11_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1164-NEXT: .LBB11_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB11_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB11_4
+; GFX1132-NEXT: .LBB11_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -5565,10 +6582,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -5583,26 +6600,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1]
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -5612,45 +6695,103 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12]
+; GFX1064-DPP-NEXT: v_min_f64 v[9:10], v[13:14], v[9:10]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
@@ -5660,123 +6801,303 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff00000
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX1032-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
-; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff00000
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1132-DPP-NEXT: .LBB11_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 5cb57703c01d9..8664fdf242036 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -784,35 +784,37 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -2014,35 +2016,37 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -3244,35 +3248,37 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -4016,35 +4022,37 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -5245,35 +5253,37 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -6071,10 +6081,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -6089,24 +6099,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB10_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB10_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB10_4
+; GFX9-NEXT: .LBB10_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6116,43 +6149,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB10_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1064-NEXT: .LBB10_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6162,115 +6218,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB10_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1032-NEXT: .LBB10_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB10_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1164-NEXT: .LBB10_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB10_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB10_4
+; GFX1132-NEXT: .LBB10_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6281,10 +6412,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -6299,24 +6430,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
@@ -6326,161 +6516,367 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1064-DPP-NEXT: s_endpgm
-;
-; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
-; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1064-DPP-NEXT: .LBB10_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1164-DPP-NEXT: .LBB10_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1132-DPP-NEXT: .LBB10_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value()
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
@@ -7017,10 +7413,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7035,24 +7431,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB12_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB12_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB12_4
+; GFX9-NEXT: .LBB12_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7062,43 +7481,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB12_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1064-NEXT: .LBB12_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7108,116 +7550,191 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB12_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1032-NEXT: .LBB12_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB12_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1164-NEXT: .LBB12_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_endpgm
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB12_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB12_4
+; GFX1132-NEXT: .LBB12_5:
+; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
@@ -7227,10 +7744,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7245,24 +7762,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7272,43 +7848,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1064-DPP-NEXT: .LBB12_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
@@ -7318,115 +7944,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1032-DPP-NEXT: .LBB12_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1164-DPP-NEXT: .LBB12_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX1132-DPP-NEXT: .LBB12_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value() strictfp
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
@@ -7964,10 +8746,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -7982,24 +8764,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB14_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB14_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB14_4
+; GFX9-NEXT: .LBB14_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
@@ -8009,43 +8814,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB14_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1064-NEXT: .LBB14_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
@@ -8055,115 +8883,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB14_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1032-NEXT: .LBB14_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB14_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1164-NEXT: .LBB14_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB14_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB14_4
+; GFX1132-NEXT: .LBB14_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
@@ -8174,10 +9077,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
@@ -8192,24 +9095,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX9-DPP-NEXT: .LBB14_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
@@ -8219,43 +9181,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1064-DPP-NEXT: .LBB14_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
@@ -8265,115 +9277,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1032-DPP-NEXT: .LBB14_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1164-DPP-NEXT: .LBB14_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2
+; GFX1132-DPP-NEXT: .LBB14_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
@@ -8442,10 +9610,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
@@ -8460,24 +9628,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB15_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB15_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB15_4
+; GFX9-NEXT: .LBB15_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8487,44 +9678,67 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1064-NEXT: s_endpgm
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB15_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1064-NEXT: .LBB15_5:
+; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
@@ -8533,115 +9747,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB15_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1032-NEXT: .LBB15_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB15_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1164-NEXT: .LBB15_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB15_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB15_4
+; GFX1132-NEXT: .LBB15_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8652,10 +9941,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -8670,24 +9959,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX9-DPP-NEXT: .LBB15_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8697,43 +10045,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1064-DPP-NEXT: .LBB15_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
@@ -8743,115 +10141,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1032-DPP-NEXT: .LBB15_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1164-DPP-NEXT: .LBB15_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2
+; GFX1132-DPP-NEXT: .LBB15_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value() strictfp
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
@@ -9388,10 +10942,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_add_u32 s8, s2, 44
-; GFX9-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -9406,24 +10960,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s3, v1, s4
+; GFX9-NEXT: v_readlane_b32 s2, v0, s4
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB17_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB17_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB17_4
+; GFX9-NEXT: .LBB17_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -9433,43 +11010,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s14, s8
-; GFX1064-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB17_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1064-NEXT: .LBB17_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -9479,115 +11079,190 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s14, s8
-; GFX1032-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s6
-; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s13, s7
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB17_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1032-NEXT: .LBB17_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
-; GFX1164-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
+; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB17_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1164-NEXT: .LBB17_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB17_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
+; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB17_4
+; GFX1132-NEXT: .LBB17_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -9598,10 +11273,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX9-DPP-NEXT: s_mov_b32 s14, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
@@ -9616,24 +11291,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_mov_b32 s13, s7
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
+; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX9-DPP-NEXT: .LBB17_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -9643,43 +11377,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1064-DPP-NEXT: .LBB17_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
@@ -9689,115 +11473,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1032-DPP-NEXT: .LBB17_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1164-DPP-NEXT: .LBB17_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
+; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2
+; GFX1132-DPP-NEXT: .LBB17_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value() strictfp
%result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 8
>From b4f019839670aded424c8b1bf35ba5237384842d Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 26 Jun 2024 04:04:50 -0400
Subject: [PATCH 11/11] review comments
---
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 34 ++++++++++++-------
1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index f204522293e6c..6d289828416b2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
return Changed;
}
+/// Returns true if \p Ty is one of the types accepted for optimization:
+/// float, double, or a 32- or 64-bit integer. Every other type yields false.
+static bool shouldOptimize(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    return true;
+  case Type::IntegerTyID: {
+    // Decide here instead of falling through into `default`: an unannotated
+    // fallthrough trips -Wimplicit-fallthrough and would silently change
+    // meaning if another case were ever inserted in between.
+    const unsigned Size = Ty->getIntegerBitWidth();
+    return Size == 32 || Size == 64;
+  }
+  default:
+    return false;
+  }
+}
+
void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// Early exit for unhandled address space atomic instructions.
switch (I.getPointerAddressSpace()) {
@@ -227,12 +241,10 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
- // value to the atomic calculation. We can only optimize divergent values if
- // we have DPP available on our subtarget, and the atomic operation is either
- // 32 or 64 bits.
- if (ValDivergent &&
- (!ST->hasDPP() || (DL->getTypeSizeInBits(I.getType()) != 32 &&
- DL->getTypeSizeInBits(I.getType()) != 64))) {
+ // value to the atomic calculation. We only optimize divergent values if
+ // we have DPP available on our subtarget, and the atomic operation is of
+ // 32/64 bit integer, float or double type.
+ if (ValDivergent && (!ST->hasDPP() || !shouldOptimize(I.getType()))) {
return;
}
@@ -311,12 +323,10 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
- // value to the atomic calculation. We can only optimize divergent values if
- // we have DPP available on our subtarget, and the atomic operation is 32 or
- // 64 bits.
- if (ValDivergent &&
- (!ST->hasDPP() || (DL->getTypeSizeInBits(I.getType()) != 32 &&
- DL->getTypeSizeInBits(I.getType()) != 64))) {
+ // value to the atomic calculation. We only optimize divergent values if
+ // we have DPP available on our subtarget, and the atomic operation is of
+ // 32/64 bit integer, float or double type.
+ if (ValDivergent && (!ST->hasDPP() || !shouldOptimize(I.getType()))) {
return;
}
More information about the cfe-commits
mailing list